twitter data mining script in python











up vote
0
down vote

favorite












I have written a simple script that searches Twitter for keywords and saves the tweets that contain those words to a CSV file. It can be found on my GitHub here.



How can I improve this code to make it more efficient and bring it up to coding standards?



"""
Script that goes through English tweets that are filtered by security words and were posted in the last hour, and stores the polarity, id, date/time, query, username and text in a CSV file.
"""

import tweepy
import datetime, time, csv, codecs
from textblob import TextBlob
import cleanit

# --- Twitter API authorization ---
# NOTE(review): placeholder credentials; load real values from the environment
# or a config file rather than committing them to source control.
consumer_key = "xxx"
consumer_secret = "xxx"
access_token = "xxx"
access_token_secret = "xxx"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# --- Column buffers ---
# storing_data() appends to text_list, id_list, name_list, created_list and
# polarityy in lockstep; the main loop reads them back by index.
# The initializers had lost their right-hand side (a syntax error), so each
# one is restored as an empty list.
big_list = []
text_list = []
id_list = []
name_list = []
created_list = []
query_list = []
polarityy = []

# Index of the next buffered row that the main loop has not written yet.
t = 0

# Search terms: the main loop runs one tweepy.Cursor query per word.
security_words = [
    'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
    'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
]

# Ambiguous terms: a tweet found via one of these is kept only when its text
# also contains a word from gen_words. Only 'smb' and 'dos' also appear in
# security_words, so the other entries are never used as a search term.
double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
gen_words = [
    "attack", "security", "hit", "detected", "protected", "injection", "data",
    "exploit", "router", 'ransomware', 'phishing', 'wannacry', 'security',
]

def storing_data(stat):
    """Buffer one accepted tweet in the module-level column lists.

    Appends the cleaned text, id, screen name, creation time and a polarity
    label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
    ``created_list`` and ``polarityy`` respectively.

    Args:
        stat: tweet status object exposing ``text``, ``id``,
            ``user.screen_name`` and ``created_at``.
    """
    # Read from the ``stat`` argument: the original body used the global loop
    # variable ``status`` and ignored its own parameter.
    # Keep the text as ``str``; the CSV file is opened with encoding='utf-8',
    # so pre-encoding to bytes would write a literal "b'...'" repr.
    text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
    id_list.append(str(stat.id))
    name_list.append(str(stat.user.screen_name))
    created_list.append(stat.created_at.strftime('%c'))

    # Sentiment score of the raw (uncleaned) tweet text.
    polarity = TextBlob(stat.text).sentiment.polarity

    # NOTE(review): scores in [-1, 0] are labelled "4" and positive scores
    # "0"; confirm this is the intended label encoding.
    polarityy.append("4" if -1 <= polarity <= 0 else "0")

def rejects(stat):
    """Append the text of a tweet that failed the filters to rejects.csv.

    Args:
        stat: tweet status object exposing ``text``.
    """
    # Read from the ``stat`` argument: the original body used the global loop
    # variable ``status`` and ignored its own parameter.
    with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
        csv.writer(rejectfile).writerow([stat.text])


# Main polling loop: search every security word, filter the hits, append the
# accepted tweets to the dataset CSV, then wait before the next pass.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation; each tweet is stored or rejected exactly once. Confirm
# against the author's intent.
while True:
    print('running', datetime.datetime.now())
    with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
        logger = csv.writer(logfile)
        for i in security_words:
            # Up to 40 English-language tweets per search term.
            for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                # A tweet counts as a retweet if either signal fires. The
                # original joined the two negated checks with `or`, so a
                # tweet passed as soon as one of them held and retweets
                # slipped through.
                if status.retweeted or 'RT @' in status.text:
                    rejects(status)
                    continue

                # Ambiguous search terms need a confirming word from
                # gen_words before the tweet is accepted.
                ambiguous = i in double_meaning_words and i in status.text
                if ambiguous and not any(word in status.text for word in gen_words):
                    rejects(status)
                else:
                    storing_data(status)

            # Write only the rows buffered since the previous flush; `t`
            # persists across passes because the buffers are never cleared.
            while t < len(polarityy):
                logger.writerow([polarityy[t], id_list[t], created_list[t], name_list[t], text_list[t]])
                t += 1

    # The CSV is closed (and flushed) before sleeping 1800 s (30 minutes).
    time.sleep(1800)









share|improve this question







New contributor




dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
























    up vote
    0
    down vote

    favorite












    I have written a simple script that searches Twitter for keywords and saves the tweets that contain those words to a CSV file. It can be found on my GitHub here.



    How can I improve this code to make it more efficient and bring it up to coding standards?



    """
    Script that goes through English tweets that are filtered by security words and were posted in the last hour, and stores the polarity, id, date/time, query, username and text in a CSV file.
    """

    import tweepy
    import datetime, time, csv, codecs
    from textblob import TextBlob
    import cleanit

    # --- Twitter API authorization ---
    # NOTE(review): placeholder credentials; load real values from the environment
    # or a config file rather than committing them to source control.
    consumer_key = "xxx"
    consumer_secret = "xxx"
    access_token = "xxx"
    access_token_secret = "xxx"
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # --- Column buffers ---
    # storing_data() appends to text_list, id_list, name_list, created_list and
    # polarityy in lockstep; the main loop reads them back by index.
    # The initializers had lost their right-hand side (a syntax error), so each
    # one is restored as an empty list.
    big_list = []
    text_list = []
    id_list = []
    name_list = []
    created_list = []
    query_list = []
    polarityy = []

    # Index of the next buffered row that the main loop has not written yet.
    t = 0

    # Search terms: the main loop runs one tweepy.Cursor query per word.
    security_words = [
        'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
        'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
    ]

    # Ambiguous terms: a tweet found via one of these is kept only when its text
    # also contains a word from gen_words. Only 'smb' and 'dos' also appear in
    # security_words, so the other entries are never used as a search term.
    double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
    gen_words = [
        "attack", "security", "hit", "detected", "protected", "injection", "data",
        "exploit", "router", 'ransomware', 'phishing', 'wannacry', 'security',
    ]

    def storing_data(stat):
        """Buffer one accepted tweet in the module-level column lists.

        Appends the cleaned text, id, screen name, creation time and a polarity
        label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
        ``created_list`` and ``polarityy`` respectively.

        Args:
            stat: tweet status object exposing ``text``, ``id``,
                ``user.screen_name`` and ``created_at``.
        """
        # Read from the ``stat`` argument: the original body used the global loop
        # variable ``status`` and ignored its own parameter.
        # Keep the text as ``str``; the CSV file is opened with encoding='utf-8',
        # so pre-encoding to bytes would write a literal "b'...'" repr.
        text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
        id_list.append(str(stat.id))
        name_list.append(str(stat.user.screen_name))
        created_list.append(stat.created_at.strftime('%c'))

        # Sentiment score of the raw (uncleaned) tweet text.
        polarity = TextBlob(stat.text).sentiment.polarity

        # NOTE(review): scores in [-1, 0] are labelled "4" and positive scores
        # "0"; confirm this is the intended label encoding.
        polarityy.append("4" if -1 <= polarity <= 0 else "0")

    def rejects(stat):
        """Append the text of a tweet that failed the filters to rejects.csv.

        Args:
            stat: tweet status object exposing ``text``.
        """
        # Read from the ``stat`` argument: the original body used the global loop
        # variable ``status`` and ignored its own parameter.
        with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
            csv.writer(rejectfile).writerow([stat.text])


    # Main polling loop: search every security word, filter the hits, append the
    # accepted tweets to the dataset CSV, then wait before the next pass.
    # NOTE(review): the nesting below was reconstructed from a paste that had lost
    # its indentation; each tweet is stored or rejected exactly once. Confirm
    # against the author's intent.
    while True:
        print('running', datetime.datetime.now())
        with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
            logger = csv.writer(logfile)
            for i in security_words:
                # Up to 40 English-language tweets per search term.
                for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                    # A tweet counts as a retweet if either signal fires. The
                    # original joined the two negated checks with `or`, so a
                    # tweet passed as soon as one of them held and retweets
                    # slipped through.
                    if status.retweeted or 'RT @' in status.text:
                        rejects(status)
                        continue

                    # Ambiguous search terms need a confirming word from
                    # gen_words before the tweet is accepted.
                    ambiguous = i in double_meaning_words and i in status.text
                    if ambiguous and not any(word in status.text for word in gen_words):
                        rejects(status)
                    else:
                        storing_data(status)

                # Write only the rows buffered since the previous flush; `t`
                # persists across passes because the buffers are never cleared.
                while t < len(polarityy):
                    logger.writerow([polarityy[t], id_list[t], created_list[t], name_list[t], text_list[t]])
                    t += 1

        # The CSV is closed (and flushed) before sleeping 1800 s (30 minutes).
        time.sleep(1800)









    share|improve this question







    New contributor




    dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.






















      up vote
      0
      down vote

      favorite









      up vote
      0
      down vote

      favorite











      I have written a simple script that searches Twitter for keywords and saves the tweets that contain those words to a CSV file. It can be found on my GitHub here.



      How can I improve this code to make it more efficient and bring it up to coding standards?



      """
      Script that goes through English tweets that are filtered by security words and were posted in the last hour, and stores the polarity, id, date/time, query, username and text in a CSV file.
      """

      import tweepy
      import datetime, time, csv, codecs
      from textblob import TextBlob
      import cleanit

      # --- Twitter API authorization ---
      # NOTE(review): placeholder credentials; load real values from the environment
      # or a config file rather than committing them to source control.
      consumer_key = "xxx"
      consumer_secret = "xxx"
      access_token = "xxx"
      access_token_secret = "xxx"
      auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
      auth.set_access_token(access_token, access_token_secret)
      api = tweepy.API(auth)

      # --- Column buffers ---
      # storing_data() appends to text_list, id_list, name_list, created_list and
      # polarityy in lockstep; the main loop reads them back by index.
      # The initializers had lost their right-hand side (a syntax error), so each
      # one is restored as an empty list.
      big_list = []
      text_list = []
      id_list = []
      name_list = []
      created_list = []
      query_list = []
      polarityy = []

      # Index of the next buffered row that the main loop has not written yet.
      t = 0

      # Search terms: the main loop runs one tweepy.Cursor query per word.
      security_words = [
          'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
          'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
      ]

      # Ambiguous terms: a tweet found via one of these is kept only when its text
      # also contains a word from gen_words. Only 'smb' and 'dos' also appear in
      # security_words, so the other entries are never used as a search term.
      double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
      gen_words = [
          "attack", "security", "hit", "detected", "protected", "injection", "data",
          "exploit", "router", 'ransomware', 'phishing', 'wannacry', 'security',
      ]

      def storing_data(stat):
          """Buffer one accepted tweet in the module-level column lists.

          Appends the cleaned text, id, screen name, creation time and a polarity
          label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
          ``created_list`` and ``polarityy`` respectively.

          Args:
              stat: tweet status object exposing ``text``, ``id``,
                  ``user.screen_name`` and ``created_at``.
          """
          # Read from the ``stat`` argument: the original body used the global loop
          # variable ``status`` and ignored its own parameter.
          # Keep the text as ``str``; the CSV file is opened with encoding='utf-8',
          # so pre-encoding to bytes would write a literal "b'...'" repr.
          text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
          id_list.append(str(stat.id))
          name_list.append(str(stat.user.screen_name))
          created_list.append(stat.created_at.strftime('%c'))

          # Sentiment score of the raw (uncleaned) tweet text.
          polarity = TextBlob(stat.text).sentiment.polarity

          # NOTE(review): scores in [-1, 0] are labelled "4" and positive scores
          # "0"; confirm this is the intended label encoding.
          polarityy.append("4" if -1 <= polarity <= 0 else "0")

      def rejects(stat):
          """Append the text of a tweet that failed the filters to rejects.csv.

          Args:
              stat: tweet status object exposing ``text``.
          """
          # Read from the ``stat`` argument: the original body used the global loop
          # variable ``status`` and ignored its own parameter.
          with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
              csv.writer(rejectfile).writerow([stat.text])


      # Main polling loop: search every security word, filter the hits, append the
      # accepted tweets to the dataset CSV, then wait before the next pass.
      # NOTE(review): the nesting below was reconstructed from a paste that had lost
      # its indentation; each tweet is stored or rejected exactly once. Confirm
      # against the author's intent.
      while True:
          print('running', datetime.datetime.now())
          with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
              logger = csv.writer(logfile)
              for i in security_words:
                  # Up to 40 English-language tweets per search term.
                  for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                      # A tweet counts as a retweet if either signal fires. The
                      # original joined the two negated checks with `or`, so a
                      # tweet passed as soon as one of them held and retweets
                      # slipped through.
                      if status.retweeted or 'RT @' in status.text:
                          rejects(status)
                          continue

                      # Ambiguous search terms need a confirming word from
                      # gen_words before the tweet is accepted.
                      ambiguous = i in double_meaning_words and i in status.text
                      if ambiguous and not any(word in status.text for word in gen_words):
                          rejects(status)
                      else:
                          storing_data(status)

                  # Write only the rows buffered since the previous flush; `t`
                  # persists across passes because the buffers are never cleared.
                  while t < len(polarityy):
                      logger.writerow([polarityy[t], id_list[t], created_list[t], name_list[t], text_list[t]])
                      t += 1

          # The CSV is closed (and flushed) before sleeping 1800 s (30 minutes).
          time.sleep(1800)









      share|improve this question







      New contributor




      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      I have written a simple script that searches Twitter for keywords and saves the tweets that contain those words to a CSV file. It can be found on my GitHub here.



      How can I improve this code to make it more efficient and bring it up to coding standards?



      """
      Script that goes through English tweets that are filtered by security words and were posted in the last hour, and stores the polarity, id, date/time, query, username and text in a CSV file.
      """

      import tweepy
      import datetime, time, csv, codecs
      from textblob import TextBlob
      import cleanit

      # --- Twitter API authorization ---
      # NOTE(review): placeholder credentials; load real values from the environment
      # or a config file rather than committing them to source control.
      consumer_key = "xxx"
      consumer_secret = "xxx"
      access_token = "xxx"
      access_token_secret = "xxx"
      auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
      auth.set_access_token(access_token, access_token_secret)
      api = tweepy.API(auth)

      # --- Column buffers ---
      # storing_data() appends to text_list, id_list, name_list, created_list and
      # polarityy in lockstep; the main loop reads them back by index.
      # The initializers had lost their right-hand side (a syntax error), so each
      # one is restored as an empty list.
      big_list = []
      text_list = []
      id_list = []
      name_list = []
      created_list = []
      query_list = []
      polarityy = []

      # Index of the next buffered row that the main loop has not written yet.
      t = 0

      # Search terms: the main loop runs one tweepy.Cursor query per word.
      security_words = [
          'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
          'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
      ]

      # Ambiguous terms: a tweet found via one of these is kept only when its text
      # also contains a word from gen_words. Only 'smb' and 'dos' also appear in
      # security_words, so the other entries are never used as a search term.
      double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
      gen_words = [
          "attack", "security", "hit", "detected", "protected", "injection", "data",
          "exploit", "router", 'ransomware', 'phishing', 'wannacry', 'security',
      ]

      def storing_data(stat):
          """Buffer one accepted tweet in the module-level column lists.

          Appends the cleaned text, id, screen name, creation time and a polarity
          label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
          ``created_list`` and ``polarityy`` respectively.

          Args:
              stat: tweet status object exposing ``text``, ``id``,
                  ``user.screen_name`` and ``created_at``.
          """
          # Read from the ``stat`` argument: the original body used the global loop
          # variable ``status`` and ignored its own parameter.
          # Keep the text as ``str``; the CSV file is opened with encoding='utf-8',
          # so pre-encoding to bytes would write a literal "b'...'" repr.
          text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
          id_list.append(str(stat.id))
          name_list.append(str(stat.user.screen_name))
          created_list.append(stat.created_at.strftime('%c'))

          # Sentiment score of the raw (uncleaned) tweet text.
          polarity = TextBlob(stat.text).sentiment.polarity

          # NOTE(review): scores in [-1, 0] are labelled "4" and positive scores
          # "0"; confirm this is the intended label encoding.
          polarityy.append("4" if -1 <= polarity <= 0 else "0")

      def rejects(stat):
          """Append the text of a tweet that failed the filters to rejects.csv.

          Args:
              stat: tweet status object exposing ``text``.
          """
          # Read from the ``stat`` argument: the original body used the global loop
          # variable ``status`` and ignored its own parameter.
          with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
              csv.writer(rejectfile).writerow([stat.text])


      # Main polling loop: search every security word, filter the hits, append the
      # accepted tweets to the dataset CSV, then wait before the next pass.
      # NOTE(review): the nesting below was reconstructed from a paste that had lost
      # its indentation; each tweet is stored or rejected exactly once. Confirm
      # against the author's intent.
      while True:
          print('running', datetime.datetime.now())
          with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
              logger = csv.writer(logfile)
              for i in security_words:
                  # Up to 40 English-language tweets per search term.
                  for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                      # A tweet counts as a retweet if either signal fires. The
                      # original joined the two negated checks with `or`, so a
                      # tweet passed as soon as one of them held and retweets
                      # slipped through.
                      if status.retweeted or 'RT @' in status.text:
                          rejects(status)
                          continue

                      # Ambiguous search terms need a confirming word from
                      # gen_words before the tweet is accepted.
                      ambiguous = i in double_meaning_words and i in status.text
                      if ambiguous and not any(word in status.text for word in gen_words):
                          rejects(status)
                      else:
                          storing_data(status)

                  # Write only the rows buffered since the previous flush; `t`
                  # persists across passes because the buffers are never cleared.
                  while t < len(polarityy):
                      logger.writerow([polarityy[t], id_list[t], created_list[t], name_list[t], text_list[t]])
                      t += 1

          # The CSV is closed (and flushed) before sleeping 1800 s (30 minutes).
          time.sleep(1800)






      python python-3.x pandas twitter






      share|improve this question







      New contributor




      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      share|improve this question







      New contributor




      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      share|improve this question




      share|improve this question






      New contributor




      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      asked 18 mins ago









      dmnte

      1




      1




      New contributor




      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.





      New contributor





      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






      dmnte is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.



























          active

          oldest

          votes











          Your Answer





          // When the "editor" module is in use, load "mathjaxEditing" and register a
          // creation callback that prepares each editor for MathJax with "$" delimiters.
          StackExchange.ifUsing("editor", function () {
              function prepareForMathJax(editor, postfix) {
                  StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
              }

              function registerCallback() {
                  StackExchange.MarkdownEditor.creationCallbacks.add(prepareForMathJax);
              }

              return StackExchange.using("mathjaxEditing", registerCallback);
          }, "mathjax-editing");

          // When the "editor" module is in use, load "externalEditor", then "snippets",
          // and finally initialise the snippets feature.
          StackExchange.ifUsing("editor", function () {
              function initSnippets() {
                  StackExchange.snippets.init();
              }

              function loadSnippets() {
                  StackExchange.using("snippets", initSnippets);
              }

              StackExchange.using("externalEditor", loadSnippets);
          }, "code-snippets");

          // On page ready: set up the tag renderer, then create the answer editor once
          // "externalEditor" (and "snippets", when enabled) has loaded.
          StackExchange.ready(function() {
              var channelOptions = {
                  tags: "".split(" "),
                  id: "196"
              };
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function() {
                  // Have to fire editor after snippets, if snippets enabled
                  if (StackExchange.settings.snippets.snippetsEnabled) {
                      StackExchange.using("snippets", function() {
                          createEditor();
                      });
                  }
                  else {
                      createEditor();
                  }
              });

              // Configure the answer editor. The two HTML strings had lost their
              // backslashes in extraction (bare `u003c` and unescaped inner quotes),
              // which is a syntax error; the `\u003c`/`\u003e` and `\"` escapes are restored.
              function createEditor() {
                  StackExchange.prepareEditor({
                      heartbeatType: 'answer',
                      convertImagesToLinks: false,
                      noModals: true,
                      showLowRepImageUploadWarning: true,
                      reputationToPostImages: null,
                      bindNavPrevention: true,
                      postfix: "",
                      imageUploader: {
                          brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                          contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                          allowUrls: true
                      },
                      onDemand: true,
                      discardSelector: ".discard-answer"
                      ,immediatelyShowMarkdownHelp:true
                  });
              }
          });






          dmnte is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          // On page ready, initialise the post-login handler for the ".new-post-login"
          // element, passing the URL-encoded address of this question's #new-answer anchor.
          StackExchange.ready(function () {
              var loginSelector = '.new-post-login';
              var encodedReturnUrl = 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208740%2ftwitter-data-mining-script-in-python%23new-answer';
              StackExchange.openid.initPostLogin(loginSelector, encodedReturnUrl, 'question_page');
          });

          Post as a guest















          Required, but never shown






























          active

          oldest

          votes













          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          dmnte is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          dmnte is a new contributor. Be nice, and check out our Code of Conduct.













          dmnte is a new contributor. Be nice, and check out our Code of Conduct.












          dmnte is a new contributor. Be nice, and check out our Code of Conduct.
















          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.





          Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


          Please pay close attention to the following guidance:


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          // On page ready, initialise the post-login handler for the ".new-post-login"
          // element, passing the URL-encoded address of this question's #new-answer anchor.
          StackExchange.ready(function () {
              var loginSelector = '.new-post-login';
              var encodedReturnUrl = 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208740%2ftwitter-data-mining-script-in-python%23new-answer';
              StackExchange.openid.initPostLogin(loginSelector, encodedReturnUrl, 'question_page');
          });

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Ottavio Pratesi

          Tricia Helfer

          15 giugno