0
私は200以上のリンクを解析しようとしていますが、BS4は出力処理をしていません。私はそれがBeautifulsoup findall get stuck without processingを見たが、それは違う。 ランダムな場所に詰まっています。Beautifulsoup stuck
import os
import urllib.request
from bs4 import BeautifulSoup
def get_html(url):
response = urllib.request.urlopen(url)
return response.read()
def parse(html, url):
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all('tr', title = "Допущено до конкурсу")
if os.path.exists('base/%s.txt' % url[27:]):
pass
else:
abitbase = open('base/%s.txt' % (url[27:]), 'w')
for unit in table:
collection = unit.find_all('td')
position = collection[0].text
name = collection[1].text
priority = collection[2].text
score = collection[3].text
abitbase.write('%s %s %s %s \n' % (position, name, priority, score))
abitbase.close()
def main():
global applicants
url_list = open('clist.txt', 'r')
for count in range(1, 241):
url_s = url_list.readline()
if url_s[-1] == '\n':
url = url_s[:-1]
else:
url = url_s
parse(get_html(url), url)
print('base [%s] saved | %s%s' %(url[27:], (round((count/2.41), 2)), '%'))
if __name__ == '__main__':
applicants = {}
main()
そしてTimeoutError:
Traceback (most recent call last):
File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1088, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1126, in _send_request
self.endheaders(body)
File "/usr/lib/python3.4/http/client.py", line 1084, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.4/http/client.py", line 922, in _send_output
self.send(msg)
File "/usr/lib/python3.4/http/client.py", line 857, in send
self.connect()
File "/usr/lib/python3.4/http/client.py", line 834, in connect
self.timeout, self.source_address)
File "/usr/lib/python3.4/socket.py", line 512, in create_connection
raise err
File "/usr/lib/python3.4/socket.py", line 503, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 58, in <module>
main()
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 53, in main
parse(get_html(url), url)
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 22, in get_html
response = urllib.request.urlopen(url)
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 463, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out>