
I am trying to get the image from the detail page of a website. I am using the RSS feed's links to get the detail-page URLs. This is my code; when I try to scrape the detail page for the image I get an error:

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('a').get('src')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    try:
        live_entries = [{'href': live_leak.links[0]['href']} for live_leak in live_leaks]
        o = make_soup(live_entries)
    except IndexError:
        print('error check logs')
        live_entries = []

    return print(o)

But when I run it, it doesn't work. Why do I get this error:

[2016-10-07 21:10:58,019: ERROR/MainProcess] Task blog.tasks.pan_task[f43ed360-c06e-4a4b-95ab-4f44a4564afa] raised unexpected: InvalidSchema("No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'",) 
Traceback (most recent call last): 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 240, in trace_task 
    R = retval = fun(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 438, in __protected_call__ 
    return self.run(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 134, in pan_task 
    o = make_soup(live_entries) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 124, in make_soup 
    the_comments_page = requests.get(url, headers=headers) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 67, in get 
    return request('get', url, params=params, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 53, in request 
    return session.request(method=method, url=url, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 468, in request 
    resp = self.send(prep, **send_kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 570, in send 
    adapter = self.get_adapter(url=request.url) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 644, in get_adapter 
    raise InvalidSchema("No connection adapters were found for '%s'" % url) 
requests.exceptions.InvalidSchema: No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]' 

Why am I getting it? I use this same function in another program.


requests takes a single URL, but you are passing it a list of dictionaries. – miah
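
A minimal sketch of what that comment points at, reusing the question's own live_leaks and make_soup and assuming nothing else changes: pass each entry's href string to make_soup (and so to requests.get) one at a time instead of the whole list.

# sketch only: requests.get() needs one URL string, so loop over the entries
pages = []
for live_leak in live_leaks:
    href = live_leak.links[0]['href']  # a single URL string per feed entry
    pages.append(make_soup(href))      # make_soup as defined in the question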


I used that function in a similar program. The only difference is that it doesn't use feedparser to get the URL. How can I get it to work? – losee

Answer


You need to do something like this. requests.get() expects a single URL string, not a list of dicts, so fetch each feed entry's link one at a time:

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('div')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    live_entries = []
    try:
        for live_leak in live_leaks:
            # each href is a single URL string, which is what requests.get() expects
            live_entries.append(make_soup(live_leak.links[0]['href']))
            # do whatever you need to with the soup here
    except IndexError:
        print('error check logs')
        live_entries = []
    return live_entries

It just returns an empty list [] – losee


What is it supposed to return? – miah


I want to grab the src of the image on the detail page so I can use it – losee
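
The thread stops here, but as a rough sketch of that last step: the detail-page markup isn't shown anywhere in the question, so the selectors below (an og:image meta tag with a fallback to the first img tag) are assumptions to adjust against the real pages.

def image_src(soupdata):
    # guess 1: Open Graph image meta tag; guess 2: first <img> on the page
    og = soupdata.find('meta', attrs={'property': 'og:image'})
    if og and og.get('content'):
        return og['content']
    img = soupdata.find('img')
    return img.get('src') if img else None

# live_entries is the list of soups returned by the answer's pan_task()
srcs = [image_src(soup) for soup in live_entries]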
