2016-07-20 11 views
0

私は200以上のリンクを解析しようとしていますが、BS4は出力処理をしていません。私はそれがBeautifulsoup findall get stuck without processingを見たが、それは違う。 ランダムな場所に詰まっています。Beautifulsoup stuck

import os 
import urllib.request 
from bs4 import BeautifulSoup 
def get_html(url): 
    response = urllib.request.urlopen(url) 
    return response.read() 

def parse(html, url): 
    soup = BeautifulSoup(html, "html.parser") 
    table = soup.find_all('tr', title = "Допущено до конкурсу") 
    if os.path.exists('base/%s.txt' % url[27:]): 
     pass 
    else: 
     abitbase = open('base/%s.txt' % (url[27:]), 'w') 
     for unit in table: 
      collection = unit.find_all('td') 
      position = collection[0].text 
      name = collection[1].text 
      priority = collection[2].text 
      score = collection[3].text 
      abitbase.write('%s %s %s %s \n' % (position, name, priority, score)) 
     abitbase.close() 

def main(): 
    global applicants 
    url_list = open('clist.txt', 'r') 
    for count in range(1, 241): 
     url_s = url_list.readline() 
     if url_s[-1] == '\n': 
      url = url_s[:-1] 
     else: 
      url = url_s 
     parse(get_html(url), url) 
     print('base [%s] saved | %s%s' %(url[27:], (round((count/2.41), 2)), '%')) 

if __name__ == '__main__': 
    applicants = {} 
    main() 

そしてTimeoutError:

Traceback (most recent call last): 
    File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open 
    h.request(req.get_method(), req.selector, req.data, headers) 
    File "/usr/lib/python3.4/http/client.py", line 1088, in request 
    self._send_request(method, url, body, headers) 
    File "/usr/lib/python3.4/http/client.py", line 1126, in _send_request 
    self.endheaders(body) 
    File "/usr/lib/python3.4/http/client.py", line 1084, in endheaders 
    self._send_output(message_body) 
    File "/usr/lib/python3.4/http/client.py", line 922, in _send_output 
    self.send(msg) 
    File "/usr/lib/python3.4/http/client.py", line 857, in send 
    self.connect() 
    File "/usr/lib/python3.4/http/client.py", line 834, in connect 
    self.timeout, self.source_address) 
    File "/usr/lib/python3.4/socket.py", line 512, in create_connection 
    raise err 
    File "/usr/lib/python3.4/socket.py", line 503, in create_connection 
    sock.connect(sa) 
TimeoutError: [Errno 110] Connection timed out 

During handling of the above exception, another exception occurred: 

Traceback (most recent call last): 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 58, in <module> 
    main() 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 53, in main 
    parse(get_html(url), url) 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 22, in get_html 
    response = urllib.request.urlopen(url) 
    File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen 
    return opener.open(url, data, timeout) 
    File "/usr/lib/python3.4/urllib/request.py", line 463, in open 
    response = self._open(req, data) 
    File "/usr/lib/python3.4/urllib/request.py", line 481, in _open 
    '_open', req) 
    File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain 
    result = func(*args) 
    File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open 
    return self.do_open(http.client.HTTPConnection, req) 
    File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open 
    raise URLError(err) 
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out> 

答えて

1

BS4が正常に動作し、それは私のせいです。

ちょうどos.path.existsparse(get_html(url), url)の前に入れても問題ありません。

申し訳ありません。