Python Web Scrapeサイクルタブ

ウェブサイト上のすべてのタブをループで巡回し、関連する情報をすべて取得する方法について、助けを求めています。

以下のサイトには、5x5、5x10、5x15、10x10 などのタブがいくつかあります。これらのタブを順に巡回するループを、スクリプトにどのように組み込めばよいのかわかりません。ご協力に感謝します。

以下はpythonスクリプトです。

from urllib.request import urlopen as uReq 
from bs4 import BeautifulSoup as soup 
import csv 

# Pages to scrape. Only one store page, with the "size=5x5" query, is listed.
urls = [ 
    'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5' 
] 

filename = 'life_storage.csv' 

# Opened in append mode, so the header row below is written again on every run.
f = open(filename, 'a+') 
csv_writer = csv.writer(f) 

headers = ['unit_size', 'unit_type', 'description', 'online_price', 'reg_price', 'store_address', 'store_city', 'store_state', 'store_postalcode' ] 

##unit_size = 5'x10' without the ' 
##unit_type = climate controlled or not (this could be blank if non-climate) 
##description = the level it's on and type of access. 
##online_price = $##/mo text 
##reg_price = the scratched off $## text 

csv_writer.writerow(headers) 

for my_url in urls: 
    # Download the page and parse it with the stdlib HTML parser.
    uClient = uReq(my_url) 
    page_html = uClient.read() 
    uClient.close() 
    page_soup = soup(page_html, 'html.parser') 


    # All address blocks and all <ul id="spaceList"> elements on the page.
    # NOTE(review): an id is normally unique, so `containers` presumably holds
    # a single <ul> that wraps every unit row - confirm against the page.
    store_locator = page_soup.findAll("div", {"itemprop": "address"}) 
    containers = page_soup.findAll("ul", {"id": "spaceList"}) 

    for container in containers: 
     for store_location in store_locator: 
      # Store address parts, read from the schema.org itemprop spans.
      store_address1 = store_location.find("span", {"itemprop": "streetAddress"}) 
      store_address = store_address1.text 
      store_city1 = store_location.find("span", {"itemprop": "addressLocality"}) 
      store_city = store_city1.text 
      store_state1 = store_location.find("span", {"itemprop": "addressRegion"}) 
      store_state = store_state1.text 
      store_postalcode1 = store_location.find("span", {"itemprop": "postalCode"}) 
      store_postalcode = store_postalcode1.text 
      # find() returns only the first match, so these two lookups return the
      # same first "storesRow" div: unit_size is the text of the whole div,
      # unit_type is the text of its <strong> child.
      title_container = container.find("div", {"class": "storesRow"}) 
      unit_size = title_container.text 
      unit_container = container.find("div", {"class": "storesRow"}) 
      unit_type = unit_container.strong.text 
      description_container = container.find("ul", {"class": "features"}) 
      description = description_container.text 
      # Both prices come from the same first "priceBox" div; `.strong.text`
      # also includes the text of any tags nested inside <strong>.
      online_price_container = container.find("div", {"class": "priceBox"}) 
      online_price = online_price_container.strong.text 
      reg_price_container = container.find("div", {"class": "priceBox"}) 
      reg_price = reg_price_container.i.text 

     # NOTE(review): this call is indented at the level of the inner `for`,
     # i.e. outside it, so it runs once per container after the inner loop
     # finishes and writes only the values assigned last.
     csv_writer.writerow([unit_size, unit_type, description, online_price, reg_price, store_address, store_city, store_state, store_postalcode]) 

f.close()

以下は、ループに関連するHTML本体のスニペットです。

//////////\\\\\\\Description BOX 
 

 

 

 
<div class="storesRow"> 
 
    <strong> 
 
<a href="/reservation/choose/?store=610&amp;type=1"> 5' x 5'<sup>*</sup> - Climate Controlled </a> 
 
</strong> 
 
    <ul class="features"> 
 
     <li>Indoor access</li> 
 
     <li>Ground Level</li> 
 
    </ul> 
 
</div> 
 

 

 

 
//////////\\\\\\\\\PRICE BOX 
 

 
<div class="priceBox"> 
 
<strong> 
 

 
             $25/mo 
 

 

 

 

 

 
               <i> $27</i> 
 
</strong> 
 
<em class="pOnly ">Phone &amp; online only</em> 
 
<div class="specialsMessage"> 
 
</div> 
 
</div> 
 

 

 
//////////\\\\\\\\\ADDRESS BOX 
 

 

 
<div itemprop="address" itemscope="" itemtype="https://schema.org/PostalAddress"> 
 
<em> 
 
<i class="fa fa-map-marker"></i> 
 
<span itemprop="streetAddress">7244 Overland Rd </span> 
 
<span itemprop="addressLocality">Orlando</span>, 
 

 
     <span itemprop="addressRegion">FL</span> 
 
<span itemprop="postalCode">32810</span> 
 
</em> 
 
</div>

CURRENT OUTPUT

所望の出力

出典

2017-12-27 D-Ru

あなたの予想される出力は何ですか？ –

@Paul - 現在の出力と、得たい出力を質問に追記しました。すべてのタブをループし、新しいサイズごとに、適切な見出しの下の新しい行へ挿入されるようにしたいです。 –

インデントが間違っています。`writerow()` は `for` の内側にあるべきです。また、すべての項目をリストに追加する必要があります。 – furas

インデントが間違っています。writerow() は内側の for ループの中にある必要があります。

ただし、各項目から正しいテキストを取り出すには、さらに手を加える必要があるかもしれません。以下のコードを参照してください。

from urllib.request import urlopen as uReq 
from bs4 import BeautifulSoup as soup 
import csv 

# Store pages to scrape; add one URL per page that should be exported.
urls = [
    'https://www.lifestorage.com/storage-units/florida/orlando/32810/610-near-lockhart/?size=5x5'
]

filename = 'life_storage.csv'

# CSV columns:
#   unit_size    - e.g. 5' x 10', with the footnote asterisk removed
#   unit_type    - text after the hyphen in the unit title (blank if none)
#   description  - feature list items (access type, level), comma-separated
#   online_price - the "$##/mo" text
#   reg_price    - the struck-through "$##" text
headers = [
    'unit_size', 'unit_type', 'description', 'online_price', 'reg_price',
    'store_address', 'store_city', 'store_state', 'store_postalcode',
]


def split_unit_title(title):
    """Split a unit title into ``(unit_size, unit_type)``.

    The title looks like ``5' x 5'* - Climate Controlled``.  Only the first
    hyphen separates size from type, so a hyphenated type is kept whole.
    The footnote asterisk is removed from the size.  ``unit_type`` is ``''``
    when the title has no hyphen.
    """
    size, _, kind = title.strip().partition('-')
    return size.replace('*', '').strip(), kind.strip()


def join_features(text):
    """Join the non-blank lines of *text* into one comma-separated string."""
    return ','.join(
        line.strip() for line in text.splitlines() if line.strip()
    )


def tag_text(parent, name, attrs):
    """Return the stripped text of the first matching tag under *parent*.

    Returns ``''`` when *parent* is ``None`` or no tag matches, so a missing
    element produces an empty CSV field instead of an ``AttributeError``.
    """
    tag = parent.find(name, attrs) if parent is not None else None
    return tag.text.strip() if tag is not None else ''


def parse_unit(container):
    """Extract one unit row from an ``<li>`` element.

    Returns ``[unit_size, unit_type, description, online_price, reg_price]``,
    or ``None`` when the element has no ``storesRow`` title, which means it
    is not a unit row.
    """
    title_row = container.find("div", {"class": "storesRow"})
    if title_row is None or title_row.strong is None:
        return None
    unit_size, unit_type = split_unit_title(title_row.strong.text)

    features = container.find("ul", {"class": "features"})
    description = join_features(features.text) if features is not None else ''

    reg_price = ''
    online_price = ''
    price_box = container.find("div", {"class": "priceBox"})
    if price_box is not None:
        if price_box.i is not None:
            reg_price = price_box.i.text.strip()
            # Remove the <i> so the regular price does not leak into the
            # text of <strong>, which is read as the online price below.
            price_box.i.extract()
        if price_box.strong is not None:
            online_price = price_box.strong.text.strip()

    return [unit_size, unit_type, description, online_price, reg_price]


def scrape_store(my_url):
    """Download one store page and return its CSV rows as a list of lists."""
    with uReq(my_url) as response:
        page_html = response.read()
    page_soup = soup(page_html, 'html.parser')

    # The address is the same for every unit on the page, so read it once.
    store_location = page_soup.find("div", {"itemprop": "address"})
    address = [
        tag_text(store_location, "span", {"itemprop": prop})
        for prop in ("streetAddress", "addressLocality",
                     "addressRegion", "postalCode")
    ]

    # Each unit is an <li> of the spaceList.  find_all() also returns nested
    # <li> elements; parse_unit() rejects any that have no unit title.
    space_list = page_soup.find("ul", {"id": "spaceList"})
    containers = space_list.find_all('li') if space_list is not None else []
    print('len(containers):', len(containers))

    rows = []
    for container in containers:
        unit = parse_unit(container)
        if unit is not None:
            rows.append(unit + address)
    return rows


def main():
    """Scrape every URL in ``urls`` and append the rows to ``filename``."""
    # newline='' is required by the csv module to avoid blank lines between
    # rows on Windows; the context manager closes the file even on errors.
    with open(filename, 'a+', newline='') as f:
        csv_writer = csv.writer(f)

        # Write the header only into an empty file, so repeated runs do not
        # scatter header rows through the data.
        f.seek(0, 2)
        if f.tell() == 0:
            csv_writer.writerow(headers)

        for my_url in urls:
            csv_writer.writerows(scrape_store(my_url))


if __name__ == '__main__':
    main()

結果：

unit_size,unit_type,description,online_price,reg_price,store_address,store_city,store_state,store_postalcode 
5' x 5',Climate Controlled,"Indoor access,Ground Level",$25/mo,$27,7244 Overland Rd,Orlando,FL,32810 
5' x 5',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
5' x 10',,"Outdoor/Drive-up access,Ground Level",$46/mo,$50,7244 Overland Rd,Orlando,FL,32810 
10' x 5',Climate Controlled,"Indoor access,Ground Level",$57/mo,$62,7244 Overland Rd,Orlando,FL,32810 
5' x 10',Climate Controlled,"Indoor access,Ground Level",$67/mo,$73,7244 Overland Rd,Orlando,FL,32810 
5' x 10',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
5' x 15',Climate Controlled,"Indoor access,Ground Level",$69/mo,$75,7244 Overland Rd,Orlando,FL,32810 
10' x 10',,"Outdoor/Drive-up access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810 
10' x 10',Climate Controlled,"Indoor access,Ground Level",$105/mo,$115,7244 Overland Rd,Orlando,FL,32810 
10' x 10',Climate Controlled,"Indoor access,Ground Level",$124/mo,$136,7244 Overland Rd,Orlando,FL,32810 
10' x 15',,"Outdoor/Drive-up access,Ground Level",$144/mo,$158,7244 Overland Rd,Orlando,FL,32810 
10' x 16',,"Outdoor/Drive-up access,Ground Level",$145/mo,$159,7244 Overland Rd,Orlando,FL,32810 
10' x 15',Climate Controlled,"Indoor access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810 
10' x 18',,"Outdoor/Drive-up access,Ground Level",$149/mo,$163,7244 Overland Rd,Orlando,FL,32810 
10' x 15',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
10' x 20',,"Outdoor/Drive-up access,Ground Level",$147/mo,$161,7244 Overland Rd,Orlando,FL,32810 
10' x 25',Climate Controlled,"Indoor access,Ground Level",$175/mo,$192,7244 Overland Rd,Orlando,FL,32810 
10' x 20',Climate Controlled,"Indoor access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
10' x 28',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
41' x 41',,"Outdoor/Drive-up access,Ground Level",$1400/mo,$1540,7244 Overland Rd,Orlando,FL,32810 
22' x 25',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810 
18' x 38',,"Outdoor/Drive-up access,Ground Level",Check for Availability,,7244 Overland Rd,Orlando,FL,32810

出典

2017-12-28 02:50:24 furas

@ Furas - Dziękuję –

Python Web Scrapeサイクルタブ

答えて

関連する問題