Python webscraper to get property info from Tripadvisor












0












$begingroup$


I'm very much a beginner at programming, and hoping to get some advice!
I'm having some trouble with a tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but i'm not sure how to use anything more specific because there are randomly generated strings on the more specific selectors. Below the snippet that is taking awhile, and below that is the full code. Would appreciate any feedback!



Sample of the webpages im scraping:



https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html



https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html



Code giving me problems:



        num_rooms = 0
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
data = data.text.strip()
if data.isdigit():
num_rooms = int(data)


Full code:



import requests
from bs4 import BeautifulSoup
import xlsxwriter
import time


def get_soup(url):
r = requests.get(url)
return BeautifulSoup(r.content, 'html.parser')


def write_xlsx(items, xlsx_write_row):
write_column = 0
for item in items:
worksheet.write(xlsx_write_row, write_column, item)
write_column += 1


workbook = xlsxwriter.Workbook('Results.xlsx')
worksheet = workbook.add_worksheet()


# user variables
while True:
start_url = input('Start url: ')
if 'https://www.tripadvisor.com.sg/Hotels-' not in start_url:
print(
'Please enter a valid url. e.g https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')
else:
break

print('fetching page...')
soup = get_soup(start_url)

while True:
min_rev_num = input('Min Reviews for property: ')
if min_rev_num.isdigit():
if int(min_rev_num) >= 0:
min_rev_num = int(min_rev_num)
break
print('Please enter a valid number')

while True:
print('Enter max number of low review number properties on a single page, from 0 to 30.')
print('(Program will exit once this condition is fulfilled)')
num_rev_criteria = input('Input: ')
if num_rev_criteria.isdigit():
if 0 <= int(num_rev_criteria) <= 30:
num_rev_criteria = int(num_rev_criteria)
break

print('Please enter a valid number')

while True:
min_star_rating = input('Min star rating for property: ')
if min_star_rating.isdigit():
if 0 <= int(min_star_rating) <= 5:
min_star_rating = float(min_star_rating)
break

print('Please enter a valid number')

while True:
min_room_num = input('Min number of rooms: ')
if min_room_num.isdigit():
if int(min_room_num) >= 0:
min_room_num = int(min_room_num)
break
print('Please enter a valid number')

while True:
max_num_pages = int(soup.select_one('.pageNum.last.taLnk').text.strip())
num_pages = input('Page to search until(1 to {}):'.format(str(max_num_pages)))
if num_pages.isdigit():
if 1 <= int(num_pages) <= max_num_pages:
num_pages = int(num_pages)
break
print('Please enter a valid number')
print('-'*30 + 'n')
check = input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

write_row = 0
write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
page_url = start_url
rejected_properties = 0

start = time.time()
print('Getting data...')

# get property data
for page_num in range(num_pages):
print('nOn page {}n'.format(str(page_num + 1)))
low_review_count = 0
soup = get_soup(page_url)
if page_num != num_pages - 1:
next_page = soup.select_one('.nav.next.taLnk.ui_button.primary')['href']
page_url = 'https://www.tripadvisor.com.sg' + next_page
else:
pass
rows = soup.select('.property_title.prominent')
prop_urls =
for row in rows:
prop_urls.append('https://www.tripadvisor.com.sg' + row['href'])
for prop in prop_urls:
soup = get_soup(prop)
try:
num_reviews = int(soup.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
except AttributeError:
num_reviews = 0

try:
property_name = soup.select_one('#HEADING').text.strip()
except AttributeError:
property_name = ' '

if num_reviews >= min_rev_num:

try:
star_rating_class = soup.select_one('.ui_star_rating')['class'][1]
star_rating = float(star_rating_class[5] + '.' + star_rating_class[6])
except TypeError:
star_rating = 0

num_rooms = 0
extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
for data in extra_info:
data = data.text.strip()
if data.isdigit():
num_rooms = int(data)

try:
address = soup.select_one('.street-address').text.strip() + ', ' + soup.select_one('.locality').text.strip() + soup.select_one('.country-name').text.strip()
except AttributeError:
address = ' '

try:
phone = soup.select_one('.is-hidden-mobile.detail').text.strip()
except AttributeError:
phone = ' '

if star_rating >= min_star_rating or star_rating == 0:
if num_rooms >= min_room_num or num_rooms == 0:
write_row += 1
write_xlsx([property_name + 'n' + address + 'nT: ' + phone, star_rating, num_rooms], write_row)
else:
print("Rejected: '{}'n".format(property_name) + ' - Not enough rooms:{}'.format(num_rooms))
else:
print("Rejected: '{}'n".format(property_name)+' - Not high enough star rating:{}'.format(star_rating))
else:
low_review_count += 1
print("Rejected: '{}'n".format(property_name) + ' - Not enough reviews:{}'.format(num_reviews))
print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))

if low_review_count >= num_rev_criteria:
print('Exiting due to low review count on page')
break

workbook.close()
end = time.time()

print("nDone! Results can be found in 'Results.xlsx' in the same foldern")
print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
print('Address and phone numbers are based on Tripadvisor data as welln')
print('Number of pages searched: {}'.format(str(page_num + 1)))
props_searched = (page_num - 1)*30 + len(prop_urls)
print('Number of properties searched: {}'.format(str(props_searched)))
print('Number of properties accepted: {}'.format(str(write_row - 1)))
print('Number of properties rejected: {}'.format(str(props_searched - write_row + 1)))
print('Time taken: {} minutes'.format(str((end-start)//60)))
while True:
check = input('nTo exit, press enter')
if True:
break









share|improve this question









New contributor




Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.







$endgroup$

















    0












    $begingroup$


    I'm very much a beginner at programming, and hoping to get some advice!
    I'm having some trouble with a tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but i'm not sure how to use anything more specific because there are randomly generated strings on the more specific selectors. Below the snippet that is taking awhile, and below that is the full code. Would appreciate any feedback!



    Sample of the webpages im scraping:



    https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html



    https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html



    Code giving me problems:



            num_rooms = 0
    extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
    for data in extra_info:
    data = data.text.strip()
    if data.isdigit():
    num_rooms = int(data)


    Full code:



    import requests
    from bs4 import BeautifulSoup
    import xlsxwriter
    import time


    def get_soup(url):
    r = requests.get(url)
    return BeautifulSoup(r.content, 'html.parser')


    def write_xlsx(items, xlsx_write_row):
    write_column = 0
    for item in items:
    worksheet.write(xlsx_write_row, write_column, item)
    write_column += 1


    workbook = xlsxwriter.Workbook('Results.xlsx')
    worksheet = workbook.add_worksheet()


    # user variables
    while True:
    start_url = input('Start url: ')
    if 'https://www.tripadvisor.com.sg/Hotels-' not in start_url:
    print(
    'Please enter a valid url. e.g https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')
    else:
    break

    print('fetching page...')
    soup = get_soup(start_url)

    while True:
    min_rev_num = input('Min Reviews for property: ')
    if min_rev_num.isdigit():
    if int(min_rev_num) >= 0:
    min_rev_num = int(min_rev_num)
    break
    print('Please enter a valid number')

    while True:
    print('Enter max number of low review number properties on a single page, from 0 to 30.')
    print('(Program will exit once this condition is fulfilled)')
    num_rev_criteria = input('Input: ')
    if num_rev_criteria.isdigit():
    if 0 <= int(num_rev_criteria) <= 30:
    num_rev_criteria = int(num_rev_criteria)
    break

    print('Please enter a valid number')

    while True:
    min_star_rating = input('Min star rating for property: ')
    if min_star_rating.isdigit():
    if 0 <= int(min_star_rating) <= 5:
    min_star_rating = float(min_star_rating)
    break

    print('Please enter a valid number')

    while True:
    min_room_num = input('Min number of rooms: ')
    if min_room_num.isdigit():
    if int(min_room_num) >= 0:
    min_room_num = int(min_room_num)
    break
    print('Please enter a valid number')

    while True:
    max_num_pages = int(soup.select_one('.pageNum.last.taLnk').text.strip())
    num_pages = input('Page to search until(1 to {}):'.format(str(max_num_pages)))
    if num_pages.isdigit():
    if 1 <= int(num_pages) <= max_num_pages:
    num_pages = int(num_pages)
    break
    print('Please enter a valid number')
    print('-'*30 + 'n')
    check = input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

    write_row = 0
    write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
    page_url = start_url
    rejected_properties = 0

    start = time.time()
    print('Getting data...')

    # get property data
    for page_num in range(num_pages):
    print('nOn page {}n'.format(str(page_num + 1)))
    low_review_count = 0
    soup = get_soup(page_url)
    if page_num != num_pages - 1:
    next_page = soup.select_one('.nav.next.taLnk.ui_button.primary')['href']
    page_url = 'https://www.tripadvisor.com.sg' + next_page
    else:
    pass
    rows = soup.select('.property_title.prominent')
    prop_urls =
    for row in rows:
    prop_urls.append('https://www.tripadvisor.com.sg' + row['href'])
    for prop in prop_urls:
    soup = get_soup(prop)
    try:
    num_reviews = int(soup.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
    except AttributeError:
    num_reviews = 0

    try:
    property_name = soup.select_one('#HEADING').text.strip()
    except AttributeError:
    property_name = ' '

    if num_reviews >= min_rev_num:

    try:
    star_rating_class = soup.select_one('.ui_star_rating')['class'][1]
    star_rating = float(star_rating_class[5] + '.' + star_rating_class[6])
    except TypeError:
    star_rating = 0

    num_rooms = 0
    extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
    for data in extra_info:
    data = data.text.strip()
    if data.isdigit():
    num_rooms = int(data)

    try:
    address = soup.select_one('.street-address').text.strip() + ', ' + soup.select_one('.locality').text.strip() + soup.select_one('.country-name').text.strip()
    except AttributeError:
    address = ' '

    try:
    phone = soup.select_one('.is-hidden-mobile.detail').text.strip()
    except AttributeError:
    phone = ' '

    if star_rating >= min_star_rating or star_rating == 0:
    if num_rooms >= min_room_num or num_rooms == 0:
    write_row += 1
    write_xlsx([property_name + 'n' + address + 'nT: ' + phone, star_rating, num_rooms], write_row)
    else:
    print("Rejected: '{}'n".format(property_name) + ' - Not enough rooms:{}'.format(num_rooms))
    else:
    print("Rejected: '{}'n".format(property_name)+' - Not high enough star rating:{}'.format(star_rating))
    else:
    low_review_count += 1
    print("Rejected: '{}'n".format(property_name) + ' - Not enough reviews:{}'.format(num_reviews))
    print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))

    if low_review_count >= num_rev_criteria:
    print('Exiting due to low review count on page')
    break

    workbook.close()
    end = time.time()

    print("nDone! Results can be found in 'Results.xlsx' in the same foldern")
    print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
    print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
    print('Address and phone numbers are based on Tripadvisor data as welln')
    print('Number of pages searched: {}'.format(str(page_num + 1)))
    props_searched = (page_num - 1)*30 + len(prop_urls)
    print('Number of properties searched: {}'.format(str(props_searched)))
    print('Number of properties accepted: {}'.format(str(write_row - 1)))
    print('Number of properties rejected: {}'.format(str(props_searched - write_row + 1)))
    print('Time taken: {} minutes'.format(str((end-start)//60)))
    while True:
    check = input('nTo exit, press enter')
    if True:
    break









    share|improve this question









    New contributor




    Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.







    $endgroup$















      0












      0








      0





      $begingroup$


      I'm very much a beginner at programming, and hoping to get some advice!
      I'm having some trouble with a tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but i'm not sure how to use anything more specific because there are randomly generated strings on the more specific selectors. Below the snippet that is taking awhile, and below that is the full code. Would appreciate any feedback!



      Sample of the webpages im scraping:



      https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html



      https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html



      Code giving me problems:



              num_rooms = 0
      extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
      for data in extra_info:
      data = data.text.strip()
      if data.isdigit():
      num_rooms = int(data)


      Full code:



      import requests
      from bs4 import BeautifulSoup
      import xlsxwriter
      import time


      def get_soup(url):
      r = requests.get(url)
      return BeautifulSoup(r.content, 'html.parser')


      def write_xlsx(items, xlsx_write_row):
      write_column = 0
      for item in items:
      worksheet.write(xlsx_write_row, write_column, item)
      write_column += 1


      workbook = xlsxwriter.Workbook('Results.xlsx')
      worksheet = workbook.add_worksheet()


      # user variables
      while True:
      start_url = input('Start url: ')
      if 'https://www.tripadvisor.com.sg/Hotels-' not in start_url:
      print(
      'Please enter a valid url. e.g https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')
      else:
      break

      print('fetching page...')
      soup = get_soup(start_url)

      while True:
      min_rev_num = input('Min Reviews for property: ')
      if min_rev_num.isdigit():
      if int(min_rev_num) >= 0:
      min_rev_num = int(min_rev_num)
      break
      print('Please enter a valid number')

      while True:
      print('Enter max number of low review number properties on a single page, from 0 to 30.')
      print('(Program will exit once this condition is fulfilled)')
      num_rev_criteria = input('Input: ')
      if num_rev_criteria.isdigit():
      if 0 <= int(num_rev_criteria) <= 30:
      num_rev_criteria = int(num_rev_criteria)
      break

      print('Please enter a valid number')

      while True:
      min_star_rating = input('Min star rating for property: ')
      if min_star_rating.isdigit():
      if 0 <= int(min_star_rating) <= 5:
      min_star_rating = float(min_star_rating)
      break

      print('Please enter a valid number')

      while True:
      min_room_num = input('Min number of rooms: ')
      if min_room_num.isdigit():
      if int(min_room_num) >= 0:
      min_room_num = int(min_room_num)
      break
      print('Please enter a valid number')

      while True:
      max_num_pages = int(soup.select_one('.pageNum.last.taLnk').text.strip())
      num_pages = input('Page to search until(1 to {}):'.format(str(max_num_pages)))
      if num_pages.isdigit():
      if 1 <= int(num_pages) <= max_num_pages:
      num_pages = int(num_pages)
      break
      print('Please enter a valid number')
      print('-'*30 + 'n')
      check = input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

      write_row = 0
      write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
      page_url = start_url
      rejected_properties = 0

      start = time.time()
      print('Getting data...')

      # get property data
      for page_num in range(num_pages):
      print('nOn page {}n'.format(str(page_num + 1)))
      low_review_count = 0
      soup = get_soup(page_url)
      if page_num != num_pages - 1:
      next_page = soup.select_one('.nav.next.taLnk.ui_button.primary')['href']
      page_url = 'https://www.tripadvisor.com.sg' + next_page
      else:
      pass
      rows = soup.select('.property_title.prominent')
      prop_urls =
      for row in rows:
      prop_urls.append('https://www.tripadvisor.com.sg' + row['href'])
      for prop in prop_urls:
      soup = get_soup(prop)
      try:
      num_reviews = int(soup.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
      except AttributeError:
      num_reviews = 0

      try:
      property_name = soup.select_one('#HEADING').text.strip()
      except AttributeError:
      property_name = ' '

      if num_reviews >= min_rev_num:

      try:
      star_rating_class = soup.select_one('.ui_star_rating')['class'][1]
      star_rating = float(star_rating_class[5] + '.' + star_rating_class[6])
      except TypeError:
      star_rating = 0

      num_rooms = 0
      extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
      for data in extra_info:
      data = data.text.strip()
      if data.isdigit():
      num_rooms = int(data)

      try:
      address = soup.select_one('.street-address').text.strip() + ', ' + soup.select_one('.locality').text.strip() + soup.select_one('.country-name').text.strip()
      except AttributeError:
      address = ' '

      try:
      phone = soup.select_one('.is-hidden-mobile.detail').text.strip()
      except AttributeError:
      phone = ' '

      if star_rating >= min_star_rating or star_rating == 0:
      if num_rooms >= min_room_num or num_rooms == 0:
      write_row += 1
      write_xlsx([property_name + 'n' + address + 'nT: ' + phone, star_rating, num_rooms], write_row)
      else:
      print("Rejected: '{}'n".format(property_name) + ' - Not enough rooms:{}'.format(num_rooms))
      else:
      print("Rejected: '{}'n".format(property_name)+' - Not high enough star rating:{}'.format(star_rating))
      else:
      low_review_count += 1
      print("Rejected: '{}'n".format(property_name) + ' - Not enough reviews:{}'.format(num_reviews))
      print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))

      if low_review_count >= num_rev_criteria:
      print('Exiting due to low review count on page')
      break

      workbook.close()
      end = time.time()

      print("nDone! Results can be found in 'Results.xlsx' in the same foldern")
      print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
      print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
      print('Address and phone numbers are based on Tripadvisor data as welln')
      print('Number of pages searched: {}'.format(str(page_num + 1)))
      props_searched = (page_num - 1)*30 + len(prop_urls)
      print('Number of properties searched: {}'.format(str(props_searched)))
      print('Number of properties accepted: {}'.format(str(write_row - 1)))
      print('Number of properties rejected: {}'.format(str(props_searched - write_row + 1)))
      print('Time taken: {} minutes'.format(str((end-start)//60)))
      while True:
      check = input('nTo exit, press enter')
      if True:
      break









      share|improve this question









      New contributor




      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.







      $endgroup$




      I'm very much a beginner at programming, and hoping to get some advice!
      I'm having some trouble with a tripadvisor scraper being slow, and have identified the part of my code that is taking a while. It's likely because of the long selector, but i'm not sure how to use anything more specific because there are randomly generated strings on the more specific selectors. Below the snippet that is taking awhile, and below that is the full code. Would appreciate any feedback!



      Sample of the webpages im scraping:



      https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html



      https://www.tripadvisor.com.sg/Hotel_Review-g255100-d257433-Reviews-The_Hotel_Windsor-Melbourne_Victoria.html



      Code giving me problems:



              num_rooms = 0
      extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
      for data in extra_info:
      data = data.text.strip()
      if data.isdigit():
      num_rooms = int(data)


      Full code:



      import requests
      from bs4 import BeautifulSoup
      import xlsxwriter
      import time


      def get_soup(url):
      r = requests.get(url)
      return BeautifulSoup(r.content, 'html.parser')


      def write_xlsx(items, xlsx_write_row):
      write_column = 0
      for item in items:
      worksheet.write(xlsx_write_row, write_column, item)
      write_column += 1


      workbook = xlsxwriter.Workbook('Results.xlsx')
      worksheet = workbook.add_worksheet()


      # user variables
      while True:
      start_url = input('Start url: ')
      if 'https://www.tripadvisor.com.sg/Hotels-' not in start_url:
      print(
      'Please enter a valid url. e.g https://www.tripadvisor.com.sg/Hotels-g255100-Melbourne_Victoria-Hotels.html')
      else:
      break

      print('fetching page...')
      soup = get_soup(start_url)

      while True:
      min_rev_num = input('Min Reviews for property: ')
      if min_rev_num.isdigit():
      if int(min_rev_num) >= 0:
      min_rev_num = int(min_rev_num)
      break
      print('Please enter a valid number')

      while True:
      print('Enter max number of low review number properties on a single page, from 0 to 30.')
      print('(Program will exit once this condition is fulfilled)')
      num_rev_criteria = input('Input: ')
      if num_rev_criteria.isdigit():
      if 0 <= int(num_rev_criteria) <= 30:
      num_rev_criteria = int(num_rev_criteria)
      break

      print('Please enter a valid number')

      while True:
      min_star_rating = input('Min star rating for property: ')
      if min_star_rating.isdigit():
      if 0 <= int(min_star_rating) <= 5:
      min_star_rating = float(min_star_rating)
      break

      print('Please enter a valid number')

      while True:
      min_room_num = input('Min number of rooms: ')
      if min_room_num.isdigit():
      if int(min_room_num) >= 0:
      min_room_num = int(min_room_num)
      break
      print('Please enter a valid number')

      while True:
      max_num_pages = int(soup.select_one('.pageNum.last.taLnk').text.strip())
      num_pages = input('Page to search until(1 to {}):'.format(str(max_num_pages)))
      if num_pages.isdigit():
      if 1 <= int(num_pages) <= max_num_pages:
      num_pages = int(num_pages)
      break
      print('Please enter a valid number')
      print('-'*30 + 'n')
      check = input("Make sure 'Results.xlsx' is closed and deleted. Once you are ready, press enter")

      write_row = 0
      write_xlsx(['Property Details', 'Star Rating', 'Number of Rooms'], write_row)
      page_url = start_url
      rejected_properties = 0

      start = time.time()
      print('Getting data...')

      # get property data
      for page_num in range(num_pages):
      print('nOn page {}n'.format(str(page_num + 1)))
      low_review_count = 0
      soup = get_soup(page_url)
      if page_num != num_pages - 1:
      next_page = soup.select_one('.nav.next.taLnk.ui_button.primary')['href']
      page_url = 'https://www.tripadvisor.com.sg' + next_page
      else:
      pass
      rows = soup.select('.property_title.prominent')
      prop_urls =
      for row in rows:
      prop_urls.append('https://www.tripadvisor.com.sg' + row['href'])
      for prop in prop_urls:
      soup = get_soup(prop)
      try:
      num_reviews = int(soup.select_one('.reviewCount').text.strip().split(' ')[0].replace(',', ''))
      except AttributeError:
      num_reviews = 0

      try:
      property_name = soup.select_one('#HEADING').text.strip()
      except AttributeError:
      property_name = ' '

      if num_reviews >= min_rev_num:

      try:
      star_rating_class = soup.select_one('.ui_star_rating')['class'][1]
      star_rating = float(star_rating_class[5] + '.' + star_rating_class[6])
      except TypeError:
      star_rating = 0

      num_rooms = 0
      extra_info = soup.select('#taplc_about_addendum_react_0 div div div div')
      for data in extra_info:
      data = data.text.strip()
      if data.isdigit():
      num_rooms = int(data)

      try:
      address = soup.select_one('.street-address').text.strip() + ', ' + soup.select_one('.locality').text.strip() + soup.select_one('.country-name').text.strip()
      except AttributeError:
      address = ' '

      try:
      phone = soup.select_one('.is-hidden-mobile.detail').text.strip()
      except AttributeError:
      phone = ' '

      if star_rating >= min_star_rating or star_rating == 0:
      if num_rooms >= min_room_num or num_rooms == 0:
      write_row += 1
      write_xlsx([property_name + 'n' + address + 'nT: ' + phone, star_rating, num_rooms], write_row)
      else:
      print("Rejected: '{}'n".format(property_name) + ' - Not enough rooms:{}'.format(num_rooms))
      else:
      print("Rejected: '{}'n".format(property_name)+' - Not high enough star rating:{}'.format(star_rating))
      else:
      low_review_count += 1
      print("Rejected: '{}'n".format(property_name) + ' - Not enough reviews:{}'.format(num_reviews))
      print(' - Low review count: {}/{}'.format(low_review_count, num_rev_criteria))

      if low_review_count >= num_rev_criteria:
      print('Exiting due to low review count on page')
      break

      workbook.close()
      end = time.time()

      print("nDone! Results can be found in 'Results.xlsx' in the same foldern")
      print('Results can be copied straight onto the shortlist(paste values only), formatting has already been done.')
      print('If any results have 0 stars or 0 rooms, Tripadvisor does not have this data')
      print('Address and phone numbers are based on Tripadvisor data as welln')
      print('Number of pages searched: {}'.format(str(page_num + 1)))
      props_searched = (page_num - 1)*30 + len(prop_urls)
      print('Number of properties searched: {}'.format(str(props_searched)))
      print('Number of properties accepted: {}'.format(str(write_row - 1)))
      print('Number of properties rejected: {}'.format(str(props_searched - write_row + 1)))
      print('Time taken: {} minutes'.format(str((end-start)//60)))
      while True:
      check = input('nTo exit, press enter')
      if True:
      break






      python web-scraping






      share|improve this question









      New contributor




      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.











      share|improve this question









      New contributor




      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      share|improve this question




      share|improve this question








      edited 13 mins ago







      Josh Lim













      New contributor




      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      asked 1 hour ago









      Josh LimJosh Lim

      11




      11




      New contributor




      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.





      New contributor





      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






      Josh Lim is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






















          0






          active

          oldest

          votes











          Your Answer





          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "196"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: false,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });






          Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f215472%2fpython-webscraper-to-get-property-info-from-tripadvisor%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.













          Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.












          Josh Lim is a new contributor. Be nice, and check out our Code of Conduct.
















          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f215472%2fpython-webscraper-to-get-property-info-from-tripadvisor%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Costa Masnaga

          Fotorealismo

          Sidney Franklin