Scraper to get property info from Tripadvisor
$begingroup$
I'm very much a beginner at programming, and I'm hoping to get some advice!
I'm having some trouble with a Tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but I'm not sure how to use anything more specific, because the more specific selectors contain randomly generated strings. Below is the snippet that is taking a while, and below that is the full code. I would appreciate any feedback!
Samples of the web pages I'm scraping:
https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html
https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html
Code giving me problems:
# Default to 0 when the page does not list a room count.
num_rooms = 0
# Match by structure under the fixed container id rather than by class name.
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
    cell_text = data.text.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    # The last purely numeric cell wins; presumably that is the room count.
    if cell_text.isdecimal():
        num_rooms = int(cell_text)
Full code:
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import time
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def write_xlsx(items, xlsx_write_row):
    """Write ``items`` into consecutive cells of one worksheet row.

    Args:
        items: Values to write; the first goes in column 0, the next in
            column 1, and so on.
        xlsx_write_row: Index of the row to fill (row 0 is the first row).

    Writes to the module-level ``worksheet``.
    """
    for write_column, item in enumerate(items):
        worksheet.write(xlsx_write_row, write_column, item)
workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()

BASE_URL = 'https://www.tripadvisor.com.sg'


def _ask_int(prompt, low=0, high=None):
    """Prompt until the user enters a whole number with low <= n <= high.

    ``high=None`` means there is no upper bound.
    """
    while True:
        answer = input(prompt).strip()
        # isdecimal() guarantees int() succeeds; signs and decimals are rejected.
        if answer.isdecimal():
            number = int(answer)
            if number >= low and (high is None or number <= high):
                return number
        print('Please enter a valid number')


def _text_or_blank(page, selector):
    """Return the stripped text of the first match for ``selector``, or ' '."""
    node = page.select_one(selector)
    return node.text.strip() if node is not None else ' '


def _review_count(page):
    """Return the review count shown on a property page, or 0 if unavailable."""
    try:
        return int(page.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
    except (AttributeError, ValueError):
        return 0


def _star_rating(page):
    """Return the star rating as a float, or 0 if the page shows none."""
    try:
        # Characters 5 and 6 of the second CSS class hold the two rating
        # digits (presumably a name of the form 'star_45').
        rating_class = page.select_one('.ui_star_rating')['class'][1]
        return float(rating_class[5] + '.' + rating_class[6])
    except (TypeError, KeyError, IndexError, ValueError):
        return 0


def _room_count(page):
    """Return the last purely numeric cell of the 'about' widget, or 0."""
    num_rooms = 0
    for cell in page.select('#taplc_about_addendum_react_0 div div div div'):
        cell_text = cell.text.strip()
        if cell_text.isdecimal():
            num_rooms = int(cell_text)
    return num_rooms


def _address(page):
    """Return 'street, locality country', or ' ' if any part is missing."""
    try:
        return (page.select_one('.street-address').text.strip() + ', '
                + page.select_one('.locality').text.strip()
                + page.select_one('.country-name').text.strip())
    except AttributeError:
        return ' '


# user variables
while True:
    start_url = input('Start url: ').strip()
    if start_url.startswith(BASE_URL + '/Hotels-'):
        break
    print('Please enter a valid url. e.g '
          'https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')

print('fetching page...')
soup = get_soup(start_url)

min_rev_num = _ask_int('Min Reviews for property: ')
num_rev_criteria = _ask_int(
    'Enter max number of low review number properties on a single page, from 0 to 30.\n'
    '(Program will exit once this condition is fulfilled)\n'
    'Input: ',
    high=30)
min_star_rating = float(_ask_int('Min star rating for property: ', high=5))
min_room_num = _ask_int('Min number of rooms: ')

# Read the page count once; fall back to 1 when the pager link is missing.
last_page_link = soup.select_one('.pageNum.last.taLnk')
max_num_pages = int(last_page_link.text.strip()) if last_page_link is not None else 1
num_pages = _ask_int('Page to search until(1 to {}):'.format(max_num_pages),
                     low=1, high=max_num_pages)

print('-' * 30 + '\n')
input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

# Row 0 holds the header, so write_row always equals the number of accepted
# properties written so far.
write_row = 0
write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
page_url = start_url
pages_searched = 0
props_searched = 0
start = time.time()
print('Getting data...')

# get property data
try:
    for page_num in range(num_pages):
        print('\nOn page {}\n'.format(page_num + 1))
        pages_searched = page_num + 1
        low_review_count = 0
        soup = get_soup(page_url)

        # Resolve the next listing page now, while the listing soup is at hand.
        has_next = False
        if page_num != num_pages - 1:
            next_link = soup.select_one('.nav.next.taLnk.ui_button.primary')
            if next_link is not None and next_link.get('href'):
                page_url = BASE_URL + next_link['href']
                has_next = True

        prop_urls = [BASE_URL + row['href']
                     for row in soup.select('.property_title.prominent')]
        # Count what is actually visited instead of assuming 30 per page.
        props_searched += len(prop_urls)

        for prop in prop_urls:
            prop_soup = get_soup(prop)
            num_reviews = _review_count(prop_soup)
            property_name = _text_or_blank(prop_soup, '#HEADING')

            if num_reviews < min_rev_num:
                low_review_count += 1
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough reviews:{}'.format(num_reviews))
                print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))
                continue

            # A value of 0 means "unknown", which is never a reason to reject.
            star_rating = _star_rating(prop_soup)
            if not (star_rating >= min_star_rating or star_rating == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not high enough star rating:{}'.format(star_rating))
                continue

            num_rooms = _room_count(prop_soup)
            if not (num_rooms >= min_room_num or num_rooms == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough rooms:{}'.format(num_rooms))
                continue

            address = _address(prop_soup)
            phone = _text_or_blank(prop_soup, '.is-hidden-mobile.detail')
            write_row += 1
            write_xlsx([property_name + '\n' + address + '\nT: ' + phone,
                        star_rating, num_rooms], write_row)

        # Checked once per page: stop paging after a page with too many
        # low-review properties.
        if low_review_count >= num_rev_criteria:
            print('Exiting due to low review count on page')
            break
        if not has_next:
            break
finally:
    # The file is only written on close(); always close so rows gathered
    # before an error are not lost.
    workbook.close()
end = time.time()

print("\nDone! Results can be found in 'Results.xlsx' in the same folder\n")
print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
print('Address and phone numbers are based on Tripadvisor data as well\n')
print('Number of pages searched: {}'.format(pages_searched))
print('Number of properties searched: {}'.format(props_searched))
print('Number of properties accepted: {}'.format(write_row))
print('Number of properties rejected: {}'.format(props_searched - write_row))
print('Time taken: {:.1f} minutes'.format((end - start) / 60))
input('\nTo exit, press enter')
python web-scraping
New contributor
$endgroup$
add a comment |
$begingroup$
I'm very much a beginner at programming, and I'm hoping to get some advice!
I'm having some trouble with a Tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but I'm not sure how to use anything more specific, because the more specific selectors contain randomly generated strings. Below is the snippet that is taking a while, and below that is the full code. I would appreciate any feedback!
Samples of the web pages I'm scraping:
https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html
https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html
Code giving me problems:
# Default to 0 when the page does not list a room count.
num_rooms = 0
# Match by structure under the fixed container id rather than by class name.
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
    cell_text = data.text.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    # The last purely numeric cell wins; presumably that is the room count.
    if cell_text.isdecimal():
        num_rooms = int(cell_text)
Full code:
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import time
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def write_xlsx(items, xlsx_write_row):
    """Write ``items`` into consecutive cells of one worksheet row.

    Args:
        items: Values to write; the first goes in column 0, the next in
            column 1, and so on.
        xlsx_write_row: Index of the row to fill (row 0 is the first row).

    Writes to the module-level ``worksheet``.
    """
    for write_column, item in enumerate(items):
        worksheet.write(xlsx_write_row, write_column, item)
workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()

BASE_URL = 'https://www.tripadvisor.com.sg'


def _ask_int(prompt, low=0, high=None):
    """Prompt until the user enters a whole number with low <= n <= high.

    ``high=None`` means there is no upper bound.
    """
    while True:
        answer = input(prompt).strip()
        # isdecimal() guarantees int() succeeds; signs and decimals are rejected.
        if answer.isdecimal():
            number = int(answer)
            if number >= low and (high is None or number <= high):
                return number
        print('Please enter a valid number')


def _text_or_blank(page, selector):
    """Return the stripped text of the first match for ``selector``, or ' '."""
    node = page.select_one(selector)
    return node.text.strip() if node is not None else ' '


def _review_count(page):
    """Return the review count shown on a property page, or 0 if unavailable."""
    try:
        return int(page.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
    except (AttributeError, ValueError):
        return 0


def _star_rating(page):
    """Return the star rating as a float, or 0 if the page shows none."""
    try:
        # Characters 5 and 6 of the second CSS class hold the two rating
        # digits (presumably a name of the form 'star_45').
        rating_class = page.select_one('.ui_star_rating')['class'][1]
        return float(rating_class[5] + '.' + rating_class[6])
    except (TypeError, KeyError, IndexError, ValueError):
        return 0


def _room_count(page):
    """Return the last purely numeric cell of the 'about' widget, or 0."""
    num_rooms = 0
    for cell in page.select('#taplc_about_addendum_react_0 div div div div'):
        cell_text = cell.text.strip()
        if cell_text.isdecimal():
            num_rooms = int(cell_text)
    return num_rooms


def _address(page):
    """Return 'street, locality country', or ' ' if any part is missing."""
    try:
        return (page.select_one('.street-address').text.strip() + ', '
                + page.select_one('.locality').text.strip()
                + page.select_one('.country-name').text.strip())
    except AttributeError:
        return ' '


# user variables
while True:
    start_url = input('Start url: ').strip()
    if start_url.startswith(BASE_URL + '/Hotels-'):
        break
    print('Please enter a valid url. e.g '
          'https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')

print('fetching page...')
soup = get_soup(start_url)

min_rev_num = _ask_int('Min Reviews for property: ')
num_rev_criteria = _ask_int(
    'Enter max number of low review number properties on a single page, from 0 to 30.\n'
    '(Program will exit once this condition is fulfilled)\n'
    'Input: ',
    high=30)
min_star_rating = float(_ask_int('Min star rating for property: ', high=5))
min_room_num = _ask_int('Min number of rooms: ')

# Read the page count once; fall back to 1 when the pager link is missing.
last_page_link = soup.select_one('.pageNum.last.taLnk')
max_num_pages = int(last_page_link.text.strip()) if last_page_link is not None else 1
num_pages = _ask_int('Page to search until(1 to {}):'.format(max_num_pages),
                     low=1, high=max_num_pages)

print('-' * 30 + '\n')
input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

# Row 0 holds the header, so write_row always equals the number of accepted
# properties written so far.
write_row = 0
write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
page_url = start_url
pages_searched = 0
props_searched = 0
start = time.time()
print('Getting data...')

# get property data
try:
    for page_num in range(num_pages):
        print('\nOn page {}\n'.format(page_num + 1))
        pages_searched = page_num + 1
        low_review_count = 0
        soup = get_soup(page_url)

        # Resolve the next listing page now, while the listing soup is at hand.
        has_next = False
        if page_num != num_pages - 1:
            next_link = soup.select_one('.nav.next.taLnk.ui_button.primary')
            if next_link is not None and next_link.get('href'):
                page_url = BASE_URL + next_link['href']
                has_next = True

        prop_urls = [BASE_URL + row['href']
                     for row in soup.select('.property_title.prominent')]
        # Count what is actually visited instead of assuming 30 per page.
        props_searched += len(prop_urls)

        for prop in prop_urls:
            prop_soup = get_soup(prop)
            num_reviews = _review_count(prop_soup)
            property_name = _text_or_blank(prop_soup, '#HEADING')

            if num_reviews < min_rev_num:
                low_review_count += 1
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough reviews:{}'.format(num_reviews))
                print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))
                continue

            # A value of 0 means "unknown", which is never a reason to reject.
            star_rating = _star_rating(prop_soup)
            if not (star_rating >= min_star_rating or star_rating == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not high enough star rating:{}'.format(star_rating))
                continue

            num_rooms = _room_count(prop_soup)
            if not (num_rooms >= min_room_num or num_rooms == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough rooms:{}'.format(num_rooms))
                continue

            address = _address(prop_soup)
            phone = _text_or_blank(prop_soup, '.is-hidden-mobile.detail')
            write_row += 1
            write_xlsx([property_name + '\n' + address + '\nT: ' + phone,
                        star_rating, num_rooms], write_row)

        # Checked once per page: stop paging after a page with too many
        # low-review properties.
        if low_review_count >= num_rev_criteria:
            print('Exiting due to low review count on page')
            break
        if not has_next:
            break
finally:
    # The file is only written on close(); always close so rows gathered
    # before an error are not lost.
    workbook.close()
end = time.time()

print("\nDone! Results can be found in 'Results.xlsx' in the same folder\n")
print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
print('Address and phone numbers are based on Tripadvisor data as well\n')
print('Number of pages searched: {}'.format(pages_searched))
print('Number of properties searched: {}'.format(props_searched))
print('Number of properties accepted: {}'.format(write_row))
print('Number of properties rejected: {}'.format(props_searched - write_row))
print('Time taken: {:.1f} minutes'.format((end - start) / 60))
input('\nTo exit, press enter')
python web-scraping
New contributor
$endgroup$
add a comment |
$begingroup$
I'm very much a beginner at programming, and I'm hoping to get some advice!
I'm having some trouble with a Tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but I'm not sure how to use anything more specific, because the more specific selectors contain randomly generated strings. Below is the snippet that is taking a while, and below that is the full code. I would appreciate any feedback!
Samples of the web pages I'm scraping:
https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html
https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html
Code giving me problems:
# Default to 0 when the page does not list a room count.
num_rooms = 0
# Match by structure under the fixed container id rather than by class name.
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
    cell_text = data.text.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    # The last purely numeric cell wins; presumably that is the room count.
    if cell_text.isdecimal():
        num_rooms = int(cell_text)
Full code:
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import time
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def write_xlsx(items, xlsx_write_row):
    """Write ``items`` into consecutive cells of one worksheet row.

    Args:
        items: Values to write; the first goes in column 0, the next in
            column 1, and so on.
        xlsx_write_row: Index of the row to fill (row 0 is the first row).

    Writes to the module-level ``worksheet``.
    """
    for write_column, item in enumerate(items):
        worksheet.write(xlsx_write_row, write_column, item)
workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()

BASE_URL = 'https://www.tripadvisor.com.sg'


def _ask_int(prompt, low=0, high=None):
    """Prompt until the user enters a whole number with low <= n <= high.

    ``high=None`` means there is no upper bound.
    """
    while True:
        answer = input(prompt).strip()
        # isdecimal() guarantees int() succeeds; signs and decimals are rejected.
        if answer.isdecimal():
            number = int(answer)
            if number >= low and (high is None or number <= high):
                return number
        print('Please enter a valid number')


def _text_or_blank(page, selector):
    """Return the stripped text of the first match for ``selector``, or ' '."""
    node = page.select_one(selector)
    return node.text.strip() if node is not None else ' '


def _review_count(page):
    """Return the review count shown on a property page, or 0 if unavailable."""
    try:
        return int(page.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
    except (AttributeError, ValueError):
        return 0


def _star_rating(page):
    """Return the star rating as a float, or 0 if the page shows none."""
    try:
        # Characters 5 and 6 of the second CSS class hold the two rating
        # digits (presumably a name of the form 'star_45').
        rating_class = page.select_one('.ui_star_rating')['class'][1]
        return float(rating_class[5] + '.' + rating_class[6])
    except (TypeError, KeyError, IndexError, ValueError):
        return 0


def _room_count(page):
    """Return the last purely numeric cell of the 'about' widget, or 0."""
    num_rooms = 0
    for cell in page.select('#taplc_about_addendum_react_0 div div div div'):
        cell_text = cell.text.strip()
        if cell_text.isdecimal():
            num_rooms = int(cell_text)
    return num_rooms


def _address(page):
    """Return 'street, locality country', or ' ' if any part is missing."""
    try:
        return (page.select_one('.street-address').text.strip() + ', '
                + page.select_one('.locality').text.strip()
                + page.select_one('.country-name').text.strip())
    except AttributeError:
        return ' '


# user variables
while True:
    start_url = input('Start url: ').strip()
    if start_url.startswith(BASE_URL + '/Hotels-'):
        break
    print('Please enter a valid url. e.g '
          'https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')

print('fetching page...')
soup = get_soup(start_url)

min_rev_num = _ask_int('Min Reviews for property: ')
num_rev_criteria = _ask_int(
    'Enter max number of low review number properties on a single page, from 0 to 30.\n'
    '(Program will exit once this condition is fulfilled)\n'
    'Input: ',
    high=30)
min_star_rating = float(_ask_int('Min star rating for property: ', high=5))
min_room_num = _ask_int('Min number of rooms: ')

# Read the page count once; fall back to 1 when the pager link is missing.
last_page_link = soup.select_one('.pageNum.last.taLnk')
max_num_pages = int(last_page_link.text.strip()) if last_page_link is not None else 1
num_pages = _ask_int('Page to search until(1 to {}):'.format(max_num_pages),
                     low=1, high=max_num_pages)

print('-' * 30 + '\n')
input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

# Row 0 holds the header, so write_row always equals the number of accepted
# properties written so far.
write_row = 0
write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
page_url = start_url
pages_searched = 0
props_searched = 0
start = time.time()
print('Getting data...')

# get property data
try:
    for page_num in range(num_pages):
        print('\nOn page {}\n'.format(page_num + 1))
        pages_searched = page_num + 1
        low_review_count = 0
        soup = get_soup(page_url)

        # Resolve the next listing page now, while the listing soup is at hand.
        has_next = False
        if page_num != num_pages - 1:
            next_link = soup.select_one('.nav.next.taLnk.ui_button.primary')
            if next_link is not None and next_link.get('href'):
                page_url = BASE_URL + next_link['href']
                has_next = True

        prop_urls = [BASE_URL + row['href']
                     for row in soup.select('.property_title.prominent')]
        # Count what is actually visited instead of assuming 30 per page.
        props_searched += len(prop_urls)

        for prop in prop_urls:
            prop_soup = get_soup(prop)
            num_reviews = _review_count(prop_soup)
            property_name = _text_or_blank(prop_soup, '#HEADING')

            if num_reviews < min_rev_num:
                low_review_count += 1
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough reviews:{}'.format(num_reviews))
                print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))
                continue

            # A value of 0 means "unknown", which is never a reason to reject.
            star_rating = _star_rating(prop_soup)
            if not (star_rating >= min_star_rating or star_rating == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not high enough star rating:{}'.format(star_rating))
                continue

            num_rooms = _room_count(prop_soup)
            if not (num_rooms >= min_room_num or num_rooms == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough rooms:{}'.format(num_rooms))
                continue

            address = _address(prop_soup)
            phone = _text_or_blank(prop_soup, '.is-hidden-mobile.detail')
            write_row += 1
            write_xlsx([property_name + '\n' + address + '\nT: ' + phone,
                        star_rating, num_rooms], write_row)

        # Checked once per page: stop paging after a page with too many
        # low-review properties.
        if low_review_count >= num_rev_criteria:
            print('Exiting due to low review count on page')
            break
        if not has_next:
            break
finally:
    # The file is only written on close(); always close so rows gathered
    # before an error are not lost.
    workbook.close()
end = time.time()

print("\nDone! Results can be found in 'Results.xlsx' in the same folder\n")
print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
print('Address and phone numbers are based on Tripadvisor data as well\n')
print('Number of pages searched: {}'.format(pages_searched))
print('Number of properties searched: {}'.format(props_searched))
print('Number of properties accepted: {}'.format(write_row))
print('Number of properties rejected: {}'.format(props_searched - write_row))
print('Time taken: {:.1f} minutes'.format((end - start) / 60))
input('\nTo exit, press enter')
python web-scraping
New contributor
$endgroup$
I'm very much a beginner at programming, and I'm hoping to get some advice!
I'm having some trouble with a Tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but I'm not sure how to use anything more specific, because the more specific selectors contain randomly generated strings. Below is the snippet that is taking a while, and below that is the full code. I would appreciate any feedback!
Samples of the web pages I'm scraping:
https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html
https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html
Code giving me problems:
# Default to 0 when the page does not list a room count.
num_rooms = 0
# Match by structure under the fixed container id rather than by class name.
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
    cell_text = data.text.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    # The last purely numeric cell wins; presumably that is the room count.
    if cell_text.isdecimal():
        num_rooms = int(cell_text)
Full code:
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import time
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def write_xlsx(items, xlsx_write_row):
    """Write ``items`` into consecutive cells of one worksheet row.

    Args:
        items: Values to write; the first goes in column 0, the next in
            column 1, and so on.
        xlsx_write_row: Index of the row to fill (row 0 is the first row).

    Writes to the module-level ``worksheet``.
    """
    for write_column, item in enumerate(items):
        worksheet.write(xlsx_write_row, write_column, item)
workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()

BASE_URL = 'https://www.tripadvisor.com.sg'


def _ask_int(prompt, low=0, high=None):
    """Prompt until the user enters a whole number with low <= n <= high.

    ``high=None`` means there is no upper bound.
    """
    while True:
        answer = input(prompt).strip()
        # isdecimal() guarantees int() succeeds; signs and decimals are rejected.
        if answer.isdecimal():
            number = int(answer)
            if number >= low and (high is None or number <= high):
                return number
        print('Please enter a valid number')


def _text_or_blank(page, selector):
    """Return the stripped text of the first match for ``selector``, or ' '."""
    node = page.select_one(selector)
    return node.text.strip() if node is not None else ' '


def _review_count(page):
    """Return the review count shown on a property page, or 0 if unavailable."""
    try:
        return int(page.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
    except (AttributeError, ValueError):
        return 0


def _star_rating(page):
    """Return the star rating as a float, or 0 if the page shows none."""
    try:
        # Characters 5 and 6 of the second CSS class hold the two rating
        # digits (presumably a name of the form 'star_45').
        rating_class = page.select_one('.ui_star_rating')['class'][1]
        return float(rating_class[5] + '.' + rating_class[6])
    except (TypeError, KeyError, IndexError, ValueError):
        return 0


def _room_count(page):
    """Return the last purely numeric cell of the 'about' widget, or 0."""
    num_rooms = 0
    for cell in page.select('#taplc_about_addendum_react_0 div div div div'):
        cell_text = cell.text.strip()
        if cell_text.isdecimal():
            num_rooms = int(cell_text)
    return num_rooms


def _address(page):
    """Return 'street, locality country', or ' ' if any part is missing."""
    try:
        return (page.select_one('.street-address').text.strip() + ', '
                + page.select_one('.locality').text.strip()
                + page.select_one('.country-name').text.strip())
    except AttributeError:
        return ' '


# user variables
while True:
    start_url = input('Start url: ').strip()
    if start_url.startswith(BASE_URL + '/Hotels-'):
        break
    print('Please enter a valid url. e.g '
          'https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')

print('fetching page...')
soup = get_soup(start_url)

min_rev_num = _ask_int('Min Reviews for property: ')
num_rev_criteria = _ask_int(
    'Enter max number of low review number properties on a single page, from 0 to 30.\n'
    '(Program will exit once this condition is fulfilled)\n'
    'Input: ',
    high=30)
min_star_rating = float(_ask_int('Min star rating for property: ', high=5))
min_room_num = _ask_int('Min number of rooms: ')

# Read the page count once; fall back to 1 when the pager link is missing.
last_page_link = soup.select_one('.pageNum.last.taLnk')
max_num_pages = int(last_page_link.text.strip()) if last_page_link is not None else 1
num_pages = _ask_int('Page to search until(1 to {}):'.format(max_num_pages),
                     low=1, high=max_num_pages)

print('-' * 30 + '\n')
input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

# Row 0 holds the header, so write_row always equals the number of accepted
# properties written so far.
write_row = 0
write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
page_url = start_url
pages_searched = 0
props_searched = 0
start = time.time()
print('Getting data...')

# get property data
try:
    for page_num in range(num_pages):
        print('\nOn page {}\n'.format(page_num + 1))
        pages_searched = page_num + 1
        low_review_count = 0
        soup = get_soup(page_url)

        # Resolve the next listing page now, while the listing soup is at hand.
        has_next = False
        if page_num != num_pages - 1:
            next_link = soup.select_one('.nav.next.taLnk.ui_button.primary')
            if next_link is not None and next_link.get('href'):
                page_url = BASE_URL + next_link['href']
                has_next = True

        prop_urls = [BASE_URL + row['href']
                     for row in soup.select('.property_title.prominent')]
        # Count what is actually visited instead of assuming 30 per page.
        props_searched += len(prop_urls)

        for prop in prop_urls:
            prop_soup = get_soup(prop)
            num_reviews = _review_count(prop_soup)
            property_name = _text_or_blank(prop_soup, '#HEADING')

            if num_reviews < min_rev_num:
                low_review_count += 1
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough reviews:{}'.format(num_reviews))
                print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))
                continue

            # A value of 0 means "unknown", which is never a reason to reject.
            star_rating = _star_rating(prop_soup)
            if not (star_rating >= min_star_rating or star_rating == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not high enough star rating:{}'.format(star_rating))
                continue

            num_rooms = _room_count(prop_soup)
            if not (num_rooms >= min_room_num or num_rooms == 0):
                print("Rejected: '{}'\n".format(property_name)
                      + ' - Not enough rooms:{}'.format(num_rooms))
                continue

            address = _address(prop_soup)
            phone = _text_or_blank(prop_soup, '.is-hidden-mobile.detail')
            write_row += 1
            write_xlsx([property_name + '\n' + address + '\nT: ' + phone,
                        star_rating, num_rooms], write_row)

        # Checked once per page: stop paging after a page with too many
        # low-review properties.
        if low_review_count >= num_rev_criteria:
            print('Exiting due to low review count on page')
            break
        if not has_next:
            break
finally:
    # The file is only written on close(); always close so rows gathered
    # before an error are not lost.
    workbook.close()
end = time.time()

print("\nDone! Results can be found in 'Results.xlsx' in the same folder\n")
print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
print('Address and phone numbers are based on Tripadvisor data as well\n')
print('Number of pages searched: {}'.format(pages_searched))
print('Number of properties searched: {}'.format(props_searched))
print('Number of properties accepted: {}'.format(write_row))
print('Number of properties rejected: {}'.format(props_searched - write_row))
print('Time taken: {:.1f} minutes'.format((end - start) / 60))
input('\nTo exit, press enter')
python web-scraping
python web-scraping
New contributor
New contributor
New contributor
asked 6 mins ago
Josh LimJosh Lim
1
1
New contributor
New contributor
add a comment |
add a comment |
0
active
oldest
votes
Your Answer
// When the "editor" feature is in use, load "mathjaxEditing" and register a
// creation callback that prepares each new Markdown editor for MathJax.
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
// The last argument is the list of delimiter pairs handed to MathJax prep.
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
// When the "editor" feature is in use, load "externalEditor" and then
// "snippets", and initialise the snippets module once both are available.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
// On page ready: set up the tag renderer, then create the answer editor once
// the external editor (and, if enabled, the snippets module) has loaded.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "196"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configure and create the editor used for posting an answer.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The HTML snippets keep "<" and ">" as \u003c / \u003e escapes
                // and escape inner double quotes so the literals stay valid.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});
Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
// On page ready, call helpers.onClickDraftSave with the '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// On page ready, call openid.initPostLogin for '.new-post-login', passing the
// percent-encoded URL of this question's #new-answer anchor and 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f215472%2fscraper-to-get-property-info-from-tripadvisor%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
// On page ready, call helpers.onClickDraftSave with the '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// On page ready, call openid.initPostLogin for '.new-post-login', passing the
// percent-encoded URL of this question's #new-answer anchor and 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f215472%2fscraper-to-get-property-info-from-tripadvisor%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, call helpers.onClickDraftSave with the '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, call helpers.onClickDraftSave with the '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, call helpers.onClickDraftSave with the '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown