Calculate the average word vector of a tweet using word2vec
I have a script that predicts the sentiments of a list of tweets as either positive or negative.
My biggest issue is calculating the average word vector of a tweet using two nested for loops. This is slow, and when I have too many tweets it consumes all my memory. The function builds a matrix that I then use to predict the sentiment (positive or negative) of each tweet.
Here is my code for creating the matrix:
def create_X(list_of_tweets, w2v, features):
    X = np.zeros((len(list_of_tweets), features))
    for indeks, tweet in enumerate(list_of_tweets):
        for word in tweet:
            try:
                X[indeks, :] = X[indeks, :] + model_tot.wv[str(word)]
            except:
                pass
        N = len(tweet)
        if N > 0:  # in case tweet is empty
            X[indeks] = X[indeks] / N  # taking average of word vectors
    return X
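To give a sense of scale (a rough estimate, assuming about a million tweets): np.zeros defaults to float64, so X alone takes roughly 1,000,000 × 350 × 8 bytes ≈ 2.8 GB before any words are looked up. Since gensim stores its vectors as float32, as far as I know, allocating X in float32 would halve that:

X = np.zeros((len(list_of_tweets), features), dtype=np.float32)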
Here is the complete script:
import numpy as np
import gensim
from gensim.models import Word2Vec
import csv
from sklearn import naive_bayes as nb
from sklearn.linear_model import LogisticRegression
def open_file(fileName):
    with open(str(fileName), "r", encoding="utf8") as sample:
        s = sample.readlines()
    return s

def create_X(list_of_tweets, w2v, features):
    X = np.zeros((len(list_of_tweets), features))
    for indeks, tweet in enumerate(list_of_tweets):
        for word in tweet:
            try:
                X[indeks, :] = X[indeks, :] + model_tot.wv[str(word)]
            except:
                pass
        N = len(tweet)
        if N > 0:
            X[indeks] = X[indeks] / N
    return X

def processTrainingData(list_of_tweets):
    list_of_tweets = list(set(list_of_tweets))  # remove duplicates
    list_of_tweets = [gensim.utils.simple_preprocess(line) for line in list_of_tweets]  # simple preprocessing
    return list_of_tweets

def createWordEmbedding(list_of_tweets, features, epoc):
    model = Word2Vec(list_of_tweets, size=features, window=5, min_count=1, workers=4)
    model.train(list_of_tweets, total_examples=len(list_of_tweets), epochs=epoc)
    return model

def save_csv(fileName, test_y):
    ids = np.arange(len(test_y))
    with open(fileName, 'w') as csvfile:
        tempwriter = csv.writer(csvfile)
        tempwriter.writerow(["Id", "Prediction"])
        count = 0
        for row in test_y:
            if row == 0:
                row = -1
            tempwriter.writerow([(ids[count]) + 1, str(row)])
            count = count + 1

def train(method, x, y, x_test):
    met = method.fit(x, y)
    test_y = met.predict(x_test)
    return test_y
features = 350
epoc = 50
spos = open_file("train_pos.txt")
sneg = open_file("train_neg.txt")
spos = processTrainingData(spos)
sneg = processTrainingData(sneg)
y = [1] * len(spos) + [0] * len(sneg)
stotal = spos + sneg
model_tot = createWordEmbedding(stotal, features, epoc)
X = create_X(stotal, model_tot, features)
testd = open_file("test_data.txt")
testd = [gensim.utils.simple_preprocess(line) for line in testd]
model_test = createWordEmbedding(testd, features, epoc)
X_test = create_X(testd, model_test, features)
# Build a logistic regression classifier to identify the polarity of tweets
test_y = train(LogisticRegression(), X, y, X_test)
# Build a naive Bayes classifier to identify the polarity of tweets
test_y_nb = train(nb.GaussianNB(), X, y, X_test)  # this one isn't working
save_csv('test_resultLR.csv', test_y)
save_csv('test_resultNB.csv', test_y_nb)
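A side note on memory that I noticed while pasting this: open_file pulls each whole file into a list with readlines. A lazy variant like the sketch below would keep only one line in memory at a time (iter_tweets is a hypothetical helper, not part of my script, and the duplicate removal in processTrainingData would still need the full list):

def iter_tweets(file_name):
    # yield one preprocessed tweet at a time instead of
    # holding the whole file in memory
    with open(file_name, "r", encoding="utf8") as sample:
        for line in sample:
            yield gensim.utils.simple_preprocess(line)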
What I could primarily use help with is code that uses less memory; faster code would be a bonus. I was thinking I could use a list comprehension, but I am unable to figure out a good way to do that.
model_tot.wv[str(word)] gives a vector of length equal to features.
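Something like the following is what I had in mind, though I have not verified that it is correct or faster (an untested sketch; it assumes a gensim version where "word in w2v.wv" works as a vocabulary test). Note that it averages over known words only, whereas my version divides by the full tweet length:

def create_X_mean(list_of_tweets, w2v, features):
    # same idea as create_X: one lookup per word, then
    # np.mean instead of manual accumulation and division
    X = np.zeros((len(list_of_tweets), features), dtype=np.float32)
    for indeks, tweet in enumerate(list_of_tweets):
        vecs = [w2v.wv[word] for word in tweet if word in w2v.wv]
        if vecs:  # tweet may be empty or contain only unknown words
            X[indeks] = np.mean(vecs, axis=0)
    return X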
python machine-learning memory-optimization
asked 8 hours ago by bamsebu (new contributor)
edited 3 hours ago by Jamal♦
Hi Bamsebu, here at Code Review we review working and complete code. We aren't here to help you solve bugs in your code.
– Quill
7 hours ago
@Quill, Thanks for letting me know! I may have misunderstood the rules here, or I have been a bit unclear, but my code is working. The problem is that it is slow, and when the number of tweets approaches a million my PC is in agony. If that's not the type of bug this site is meant for, I will search elsewhere.
– bamsebu
7 hours ago
Does it work okay for small inputs, but then exceed time/memory limits for larger inputs? If so, that should be okay for this site. Refer to the discussion on this meta post.
– Sᴀᴍ Onᴇᴌᴀ
58 mins ago