ValueError: empty vocabulary
I'm new to Python and trying to create a text classification program as part of a piece of work for school..
Using the following code and various (unedited) libraries including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
#importing the dataset
mediaBias_data = load_files(r"C:Usersesg1DesktopCourseYear 3Individual ProjectData Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents =
for sen in range(0, len(X)):
# Remove all the special characters
document = re.sub(r'W', ' ', str(X[sen]))
# remove all single characters
document = re.sub(r's+[a-zA-Z]s+', ' ', document)
# Remove single characters from the start
document = re.sub(r'^[a-zA-Z]s+', ' ', document)
# Substituting multiple spaces with single space
document = re.sub(r's+', ' ', document, flags=re.I)
# Removing prefixed 'b'
document = re.sub(r'^bs+', '', document)
# Converting to Lowercase
document = document.lower()
# Lemmatization
document = document.split()
document = [stemmer.lemmatize(word) for word in document]
document = ' '.join(document)
documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training test classification model and predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
pickle.dump(classifier,picklefile)
#load
with open('text_classifier', 'rb') as training_model:
model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
#Execute the following script: y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!
python python-3.x scikit-learn
add a comment |
I'm new to Python and trying to create a text classification program as part of a piece of work for school..
Using the following code and various (unedited) libraries including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
#importing the dataset
mediaBias_data = load_files(r"C:Usersesg1DesktopCourseYear 3Individual ProjectData Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents =
for sen in range(0, len(X)):
# Remove all the special characters
document = re.sub(r'W', ' ', str(X[sen]))
# remove all single characters
document = re.sub(r's+[a-zA-Z]s+', ' ', document)
# Remove single characters from the start
document = re.sub(r'^[a-zA-Z]s+', ' ', document)
# Substituting multiple spaces with single space
document = re.sub(r's+', ' ', document, flags=re.I)
# Removing prefixed 'b'
document = re.sub(r'^bs+', '', document)
# Converting to Lowercase
document = document.lower()
# Lemmatization
document = document.split()
document = [stemmer.lemmatize(word) for word in document]
document = ' '.join(document)
documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training test classification model and predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
pickle.dump(classifier,picklefile)
#load
with open('text_classifier', 'rb') as training_model:
model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
#Execute the following script: y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!
python python-3.x scikit-learn
After all the preprocessing you do on yourdocument
, either you have only the stop-words remaining in them or they are empty. Try printing thedocument
before appending them todocuments
.
– Vivek Kumar
Nov 26 '18 at 14:03
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59
add a comment |
I'm new to Python and trying to create a text classification program as part of a piece of work for school..
Using the following code and various (unedited) libraries including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
#importing the dataset
mediaBias_data = load_files(r"C:Usersesg1DesktopCourseYear 3Individual ProjectData Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents =
for sen in range(0, len(X)):
# Remove all the special characters
document = re.sub(r'W', ' ', str(X[sen]))
# remove all single characters
document = re.sub(r's+[a-zA-Z]s+', ' ', document)
# Remove single characters from the start
document = re.sub(r'^[a-zA-Z]s+', ' ', document)
# Substituting multiple spaces with single space
document = re.sub(r's+', ' ', document, flags=re.I)
# Removing prefixed 'b'
document = re.sub(r'^bs+', '', document)
# Converting to Lowercase
document = document.lower()
# Lemmatization
document = document.split()
document = [stemmer.lemmatize(word) for word in document]
document = ' '.join(document)
documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training test classification model and predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
pickle.dump(classifier,picklefile)
#load
with open('text_classifier', 'rb') as training_model:
model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
#Execute the following script: y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!
python python-3.x scikit-learn
I'm new to Python and trying to create a text classification program as part of a piece of work for school..
Using the following code and various (unedited) libraries including NumPy, scikit-learn and others, I keep getting the same error:
Traceback (most recent call last):
File "C:/Users/esg1/Python/Learning Python/stackabuse.com example/MediaBiasDetectionClassification.py", line 49, in <module>
X = vectorizer.fit_transform(documents).toarray()
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 1010, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "C:Usersesg1Pythonlibsite-packagessklearnfeature_extractiontext.py", line 941, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only contain stop words")
ValueError: empty vocabulary; perhaps the documents only contain stop words
The code I'm working from is:
#importing libraries
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords
#importing the dataset
mediaBias_data = load_files(r"C:Usersesg1DesktopCourseYear 3Individual ProjectData Gathering")
X, y = mediaBias_data.data, mediaBias_data.target
#text preprocessing
documents =
for sen in range(0, len(X)):
# Remove all the special characters
document = re.sub(r'W', ' ', str(X[sen]))
# remove all single characters
document = re.sub(r's+[a-zA-Z]s+', ' ', document)
# Remove single characters from the start
document = re.sub(r'^[a-zA-Z]s+', ' ', document)
# Substituting multiple spaces with single space
document = re.sub(r's+', ' ', document, flags=re.I)
# Removing prefixed 'b'
document = re.sub(r'^bs+', '', document)
# Converting to Lowercase
document = document.lower()
# Lemmatization
document = document.split()
document = [stemmer.lemmatize(word) for word in document]
document = ' '.join(document)
documents.append(document)
#converting text to numbers
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
#finding Term Frequency Inverse Document Frequency (TFIDF)
#TF
#TermFrequency = (Number of Occurrences of a word)/(Total words in the document)
#IDF
#IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
#training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#training test classification model and predicting sentiment
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)
#predicting
y_pred = classifier.predict(X_test)
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
#saving and loading the model
#save
with open('text_classifier', 'wb') as picklefile:
pickle.dump(classifier,picklefile)
#load
with open('text_classifier', 'rb') as training_model:
model = pickle.load(training_model)
#We loaded our trained model and stored it in the model variable.
#Let's predict the sentiment for the test set using our loaded model and see if we can get the same results.
#Execute the following script: y_pred2 = model.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))
Any advice on how to overcome the error would be appreciated!
python python-3.x scikit-learn
python python-3.x scikit-learn
edited Nov 26 '18 at 7:28
Vivek Kumar
16.6k42156
16.6k42156
asked Nov 25 '18 at 22:05
esg1esg1
184
184
After all the preprocessing you do on yourdocument
, either you have only the stop-words remaining in them or they are empty. Try printing thedocument
before appending them todocuments
.
– Vivek Kumar
Nov 26 '18 at 14:03
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59
add a comment |
After all the preprocessing you do on yourdocument
, either you have only the stop-words remaining in them or they are empty. Try printing thedocument
before appending them todocuments
.
– Vivek Kumar
Nov 26 '18 at 14:03
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59
After all the preprocessing you do on your
document
, either you have only the stop-words remaining in them or they are empty. Try printing the document
before appending them to documents
.– Vivek Kumar
Nov 26 '18 at 14:03
After all the preprocessing you do on your
document
, either you have only the stop-words remaining in them or they are empty. Try printing the document
before appending them to documents
.– Vivek Kumar
Nov 26 '18 at 14:03
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53472477%2fvalueerror-empty-vocabulary%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53472477%2fvalueerror-empty-vocabulary%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
After all the preprocessing you do on your
document
, either you have only the stop-words remaining in them or they are empty. Try printing thedocument
before appending them todocuments
.– Vivek Kumar
Nov 26 '18 at 14:03
Thank you. I've tried that and nothing is returned. Even reducing the amount of pre-processing I do (to nothing) doesn't help - still nothing is printed. I know that there is text in the .txt files. Would it help if everything was in one text file rather than individual files? Thanks in advance!
– esg1
Dec 4 '18 at 12:59