I'm trying to scrape some data from my university website, using requests and lxml.html for it. I used to work with beautifulsoup4, but it wasn't quick enough, so this is my first time with lxml. My use of requests.get() isn't working and I got the error shown below. Here is my code:
from lxml import html
import requests
import json
import logging
url = 'https://example.com/'
url_ajax = "https://example.com//webapps/portal/execute/tabs/tabAction"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest'
}
# login credentials posted to the root url
payload = {
    'user_id': 'myid',
    'password': 'mypass'
}
# data for the courses request (ajax call)
course_data = {
    'action': 'refreshAjaxModule',
    'modId': '_27_1',
    'tabId': '_1_1',
    'tab_tab_group_id': '_1_1'
}
# make sure that links are working fine
# Enabling debugging at http.client level (requests->urllib3->http.client)
# you will see the REQUEST, including HEADERS and DATA, and RESPONSE with HEADERS but without DATA.
# the only thing missing will be the response.body which is not logged.
"""try: # for Python 3
from http.client import HTTPConnection
except ImportError:
from httplib import HTTPConnection
HTTPConnection.debuglevel = 1
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
"""
# start the script
session = requests.Session()
# go to the root url and post the username and password
session.post(url, headers=headers, data=payload)
# get the data of the courses
urlajax = session.post(url_ajax, headers=headers, data=course_data)  # get the ajax call
page = requests.get(urlajax)
page.json() # This *call* raises an exception if JSON decoding fails
# here is my error
content = page.content
tree = html.fromstring(content)
ga = tree.xpath('//div[@id="div_27_1"]//div[@id="_27_1termCourses__8_1"]/ul/li[1]/a/text()')
print(ga)
And here is the error I get:
File "scrape.py", line 56, in <module>
page = requests.get(urlajax)
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\sessions.py", line 494, in request
prep = self.prepare_request(req)
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\sessions.py", line 437, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\models.py", line 305, in prepare
self.prepare_url(url, params)
File "C:\Users\HozRifai\Desktop\WEBSCR~1\lib\site-packages\requests\models.py", line 379, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '<Response [200]>': No schema supplied. Perhaps you meant http://<Response [200]>?
Yes, my uni website is ... I'm trying to redesign it. I tried the same code using beautifulsoup4 and everything works fine :)
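Update: reading the error message, it looks like requests.get() is being handed the Response object that session.post() already returned, so requests treats '<Response [200]>' as the URL. A minimal sketch of what I suspect the fetch-and-parse step should look like under that assumption (ajax_response is just a placeholder name, and this assumes the ajax body is HTML rather than JSON):

# session.post() already returns the ajax response, so parse its
# content directly instead of passing the Response back into requests.get()
ajax_response = session.post(url_ajax, headers=headers, data=course_data)
tree = html.fromstring(ajax_response.content)
ga = tree.xpath('//div[@id="div_27_1"]//div[@id="_27_1termCourses__8_1"]/ul/li[1]/a/text()')
print(ga)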