TypeError when running Concurrent Futures inside a Class
I'm building a class that mines a set of links from a site, and then I use concurrent.futures to check the validity of those links.
When I run the function multithreaded_link_checking on its own, it runs fine.
However, when it's inside a class, it raises this error:
Traceback (most recent call last):
  File "datahandler.py", line 236, in <module>
    data.multithreaded_link_checking(links)
  File "datahandler.py", line 209, in multithreaded_link_checking
    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
  File "/anaconda3/lib/python3.7/concurrent/futures/thread.py", line 128, in __init__
    if max_workers <= 0:
TypeError: '<=' not supported between instances of 'list' and 'int'
My code is below:
import concurrent.futures
import csv
import re

import requests
import urllib3
from bs4 import BeautifulSoup


class DataHandler:
    def __init__(self, url, file=None):
        self.file = file
        self.url = url

    def get_links_a_on_page(self):
        """
        Gets <a> links from the igm website
        :return list -> links:
        """
        # TODO: Check if URL is correct
        print('checking site')
        site = requests.get(self.url)
        soup = BeautifulSoup(site.text, 'html.parser')
        print("URL received, cleaning links.")
        # Find all the href on self.url
        links = [a_link['href'] for a_link in soup.find_all("a", href=True)]
        for n, i in enumerate(links):
            clean_link = re.search("http:(.*)", i)
            links[n] = clean_link.group(0)
        print("Cleaning URL")
        return links

    def get_links_from_csv(self):
        """
        :return list -> links:
        """
        # TODO: Check if File is CSV
        # TODO: Check if File has links
        links = []
        try:
            with open(self.file, newline='') as csvfile:
                spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
                for row in spamreader:
                    links.append(', '.join(row))
        except FileNotFoundError:
            print("File not found")
        return links

    def check_links_urllib3_helper(link, return_links=True):
        """
        Checks the response code of the url
        :param link:
        :return response code:
        """
        if return_links is True:
            # Initialize urllib manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD to get header values -- much faster
            r = http.request("HEAD", link)
            if r.status == 200:
                return link
        if return_links is False:
            # Initialize urllib manager -- highly efficient
            http = urllib3.PoolManager()
            # HEAD to get header values -- much faster
            r = http.request("HEAD", link)
            return r.status

    def multithreaded_link_checking(links, max_workers=99):
        """
        Multithreaded operation that reviews a set of links and identifies working links vs. 404 codes
        :param links:
        :return downloadable_links:
        """
        # list used to log the responses
        downloadable_links = []
        # Use the ThreadPoolExecutor to run concurrent requests
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            # track how many links are left -- do this to make sure the threads don't bug out
            size = len(links)
            # map(function, iterable)
            for i in executor.map(check_links_urllib3_helper, links):
                print("Links Left: ", size)
                downloadable_links.append(i)
                size -= 1
        return downloadable_links
python python-3.x multithreading concurrency
asked Nov 23 '18 at 2:09 by Rizwan Qaiser
1 Answer
Fixed. I needed to add self as the first positional argument to multithreaded_link_checking(). Without it, calling data.multithreaded_link_checking(links) binds the instance to the links parameter and the links list to max_workers, which is why ThreadPoolExecutor ends up comparing a list with an int.
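For reference, a minimal sketch of the fix described above. The self parameter on check_links_urllib3_helper and the self.check_links_urllib3_helper reference inside executor.map are my additions, since the answer only mentions multithreaded_link_checking:

    def check_links_urllib3_helper(self, link, return_links=True):
        # HEAD request keeps the check fast; returns the link (or None) when
        # return_links is True, otherwise the raw status code
        http = urllib3.PoolManager()
        r = http.request("HEAD", link)
        if return_links:
            return link if r.status == 200 else None
        return r.status

    def multithreaded_link_checking(self, links, max_workers=99):
        downloadable_links = []
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            # self.check_links_urllib3_helper is a bound method, so each worker
            # call automatically receives the instance along with one link
            for result in executor.map(self.check_links_urllib3_helper, links):
                downloadable_links.append(result)
        return downloadable_links

With these signatures, data.multithreaded_link_checking(links) passes the instance to self, the list to links, and max_workers keeps its default of 99.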
answered Nov 23 '18 at 2:19 by Rizwan Qaiser