2017-05-28 13 views
0

Pythonのスキルを向上させるために、トリップアドバイザーのスクレイパーを作っています。現在、このスクレイパーは指定した都市のレストランを収集し、店名とトリップアドバイザー上のURLをExcelファイルに保存できます。しかし、それに加えてレストランのメールアドレスと公式サイトのURLも保存したいので、その方法についてアドバイスを探しています。(タイトル:tripadvisor scrape ― レストランのURLと電子メール)

どなたか、この件についてアドバイスをいただけますか? よろしくお願いします。

import requests 
 

 
from tkinter import * 
 
from bs4 import BeautifulSoup as b 
 
from bs4 import Comment as com 
 
from openpyxl import Workbook 
 
# city_name = 'London_England' 
 
# geo_code = '186338' 
 

 

 
def o_and_t():
    """START-button callback: capture the form values and close the window.

    Reads the three entry widgets created by ``gui`` and publishes their
    contents as the module globals ``nameFile`` (with ``.xlsx`` appended),
    ``geo_code`` and ``city_name``; then destroys ``root``, which ends the
    Tk main loop.
    """
    global nameFile, geo_code, city_name

    # Read every field before tearing the window down: widgets cannot be
    # queried once the root has been destroyed.
    file_stem, code_text, city_text = e_1.get(), e_2.get(), e_3.get()

    nameFile = '{}{}'.format(file_stem, '.xlsx')
    geo_code = code_text
    city_name = city_text

    root.destroy()
 

 
def gui():
    """Show the "Enter Details" form and block until the window is closed.

    Builds a 500x230 black Tk window containing a title banner, three
    labelled entry fields (output file name, geo code, city/country slug) and
    a START button whose command is ``o_and_t``.  The window and the three
    entries are stored in the module globals ``root``, ``e_1``, ``e_2`` and
    ``e_3`` because ``o_and_t`` reads them when START is pressed.

    Returns:
        None, once ``root.mainloop()`` returns.
    """
    global root, e_1, e_2, e_3

    # The family name was previously misspelled "Helevetica", so the
    # requested Helvetica face was never actually selected.
    font_family = "Helvetica"
    colours = {'bg': 'black', 'fg': 'white'}

    root = Tk()
    root.geometry('500x230')
    root.configure(bg='black')
    root.title('Enter Details')

    banner = Label(
        root,
        text='\t\tTripAdvisorScraper\n \t\t ~by a1b2t',
        font=(font_family, 14),
        **colours
    )
    banner.place(x=0, y=0)

    # (prompt text, font size, vertical position) for each input row; the
    # entry for a row sits at the same y as its label.
    rows = (
        ('Please Enter the FileName : ', 11, 60),
        ('Please enter the code from the url : ', 11, 90),
        ('Please enter the city and country as in url :', 12, 120),
    )
    entries = []
    for prompt, size, y_pos in rows:
        Label(root, text=prompt, font=(font_family, size), **colours).place(x=0, y=y_pos)
        field = Entry(root)
        field.place(x=320, y=y_pos)
        entries.append(field)
    e_1, e_2, e_3 = entries

    start_button = Button(root, text='START', command=o_and_t)
    start_button.place(x=220, y=170)

    root.mainloop()
    return None
 

 

 
# Ask the user for the output file name, the geo code and the city slug.
gui()

# o_and_t() defines these globals only when START is pressed.  If the window
# was simply closed they do not exist, so stop with a clear message instead
# of failing later with a NameError.
if not all(name in globals() for name in ('nameFile', 'geo_code', 'city_name')):
    raise SystemExit('No details were submitted; nothing to scrape.')

print('\n\n\tStarting Scraper\t\n\n')

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
# The "o=a<N>" offset in the search URL advances by this much per page.
RESULTS_PER_PAGE = 30

main_url = 'https://www.tripadvisor.co.uk/Restaurants-g{}-{}.html'.format(geo_code, city_name)

# Example paginated search URLs (London, geo=186338):
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a150&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a60&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a30&availSearchEnabled=false

req_1 = requests.get(main_url, timeout=REQUEST_TIMEOUT)
soup = b(req_1.content, 'html.parser')

# The last pagination link carries the highest page number.  A listing that
# fits on one page has no such links, so fall back to a single page rather
# than raising IndexError on [-1].
page_links = soup.find_all('a', class_="pageNum taLnk")
last_page = int(page_links[-1]['data-page-number']) if page_links else 1
# NOTE(review): the "+ 1" is kept from the original; if data-page-number is
# 1-based this issues one request past the final page -- confirm.
total_pages = last_page + 1
print(total_pages)

RESULTS = []

for offset in range(0, total_pages * RESULTS_PER_PAGE, RESULTS_PER_PAGE):
    url = 'https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo={}&ajax=1&itags=10591&sortOrder=relevance&o=a{}&availSearchEnabled=false'.format(geo_code, offset)
    req_2 = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup_2 = b(req_2.content, 'html.parser')

    # Each restaurant on the page is an <a class="property_title"> link.
    for link in soup_2.find_all('a', class_="property_title"):
        r_name = link.text.replace('\n', '').replace('\t', '')
        r_url = 'https://www.tripadvisor.com' + link['href']
        # Printed as bytes so non-ASCII names cannot trip the console encoding.
        print(str([r_name, r_url]).encode())
        RESULTS.append([r_name, r_url])

# Write the workbook only when something was scraped.
if RESULTS:
    wb = Workbook(write_only=True)
    ws = wb.create_sheet()
    for row in RESULTS:
        ws.append(row)
    wb.save(nameFile)

print(len(RESULTS))
 
\t 
 
\t

答えて

1

取得した各レストランのURLをさらにスクレイピングして、class="detail_section info" の要素を探すだけです。

関連する問題