2017-04-21 18 views
0

こんにちは、次のコードで digg.com のフロントページの画像をスクレイピングしようとしています。問題は、0.jpg〜6.jpg は正常なのに、7.jpg から 47.jpg までは壊れていることです。理由が分かりません。スクレイピングした画像が壊れてしまいます。

ここにコードがあります。ここにGithubの:https://github.com/kenpeter/py_mm

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# global 
global_page_num = 0 
pp = pprint.PrettyPrinter(indent=4) 

# write to file 
def download_image(img_urls):
    """Download every URL in *img_urls* to img/<index>.jpg.

    Existing files are overwritten. Raises requests.HTTPError when the
    server answers with an error status, so a broken response is never
    written to disk as a ".jpg".
    """
    # total img urls
    amount = len(img_urls)

    # Create the output directory once, up front, instead of on every
    # loop iteration.
    os.makedirs('img', exist_ok=True)

    for index, url in enumerate(img_urls):
        # file name (index-based; assumes every image is a JPEG)
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch first, then open the file, so a failed request cannot
        # leave an empty/partial file behind.  The timeout prevents
        # hanging forever on a dead server; raise_for_status surfaces
        # HTTP errors instead of silently saving an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape digg.com's front page and download its thumbnail images.

    num: intended page number -- currently unused; only the front page
         is ever fetched (TODO: implement paging).

    Returns the list of scraped image URLs.
    """
    url = 'http://digg.com'
    response = requests.get(url).content
    selector = html.fromstring(response)

    # Thumbnail image URLs from the story list (src attribute only --
    # note that lazy-loaded stories keep the real URL in data-src).
    img_urls = selector.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    # Story descriptions -- scraped but currently unused; kept for the
    # debugging prints below.
    news_texts = selector.xpath("//div[@itemprop='description']/text()")

    # test
    # print('--- something ---')
    # pp.pprint(img_urls)
    # pp.pprint(news_texts)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # Entry point: scrape the hard-coded "page" and download its images.
    # page_number was originally meant to come from user input:
    # page_number = input('Please enter the page number that you want to scrape:')

    # NOTE(review): page_number is passed to get_page_number() but never
    # used there -- paging is not implemented yet.
    page_number = 4 # hardcode
    get_page_number(page_number)

回答

0

画像が「壊れて」いる理由は、ページ側で遅延読み込み（lazy load）の仕組みが使われており、実際の画像 URL が src ではなく data-src 属性に「隠されて」いるためです。あなたのコードは src 属性の内容（プレースホルダー画像）を取得してしまっています。以下は、取得したページのソースコードのうち、両方の属性を持つ例です:

例:
<img 
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset" 
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg" 
src="http://static.digg.com/static/fe/944294/images/x_455x248.png" 
width="312" 
height="170" 
alt="" 
/> 

画像 URL のリストを作成する際には、src と data-src の両方の属性をチェックし、data-src を src より優先する必要があります。

このコードは、「トリック」を行い、適切な画像をダウンロードします。

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# global 
global_page_num = 0 
pp = pprint.PrettyPrinter(indent=4) 

# write to file 
def download_image(img_urls):
    """Save each URL in img_urls to disk as img/<index>.jpg."""
    total = len(img_urls)

    for i, img_url in enumerate(img_urls):
        # Index-based target path inside the img/ directory.
        target = 'img/%s.jpg' % i
        # Ensure the output directory exists (no-op after the first pass).
        os.makedirs(os.path.dirname(target), exist_ok=True)

        print('--- start ---')
        print('filename: %s' % target)
        print('Downloading: %s out of %s' % (i, total))

        # Open the target file, then fetch the bytes and write them out.
        with open(target, 'wb') as out:
            out.write(requests.get(img_url).content)


def get_page_number(num):
    """Scrape digg.com, downloading both plain and lazy-loaded thumbnails.

    Collects image URLs from the src attribute (dropping the grey
    placeholder x_455x248.png) plus the real URLs hidden in data-src,
    downloads them all, and returns the combined URL list.
    """
    page = requests.get('http://digg.com').content
    tree = html.fromstring(page)

    src_urls = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")
    # Drop the lazy-load placeholder image from the src-based URLs.
    real_src_urls = [u for u in src_urls if 'x_455x248.png' not in u]
    # Real image URLs hidden in data-src by the lazy-loading scheme.
    lazy_urls = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@data-src")
    img_urls = real_src_urls + lazy_urls

    # Story descriptions -- scraped but currently unused.
    news_texts = tree.xpath("//div[@itemprop='description']/text()")

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # Entry point: scrape the hard-coded "page" and download its images.
    # page_number was originally meant to come from user input:
    # page_number = input('Please enter the page number that you want to scrape:')

    # NOTE(review): page_number is passed to get_page_number() but never
    # used there -- paging is not implemented yet.
    page_number = 4 # hardcode
    get_page_number(page_number)
関連する問題