私はクエリ soup.find_all("li", {"class" : "first_result"}) を使っています。
なぜ Beautiful Soup は複数のクラスを持つこの要素を見つけられないのでしょうか?
対象の要素はページ上では <li class="result first_result"> となっています。
この要素を選択しようとしているのですが、スクリプトを実行しても何も取得できません。念のため soup.find_all("li", {"class" : "result first_result"}) も試しましたが、やはり何も返ってきませんでした。
私はどこを間違えているのでしょうか?
編集:alecxe さんのリクエストに応えて、これまでのコード全体を投稿します。環境は 64 ビット版 Windows 7 上の Python 3.4 です。この質問の対象となっている部分は ###METACRITIC STUFF### 以下です。
from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint
# ---------------------------------------------------------------------------
# Script setup: parse the search phrase from argv and fetch the first KAT
# results page to discover how many result pages exist.
# ---------------------------------------------------------------------------
connectBuilder = poolmanager.PoolManager()

inputstring = sys.argv[1]  # argv string MUST use double quotes
# Raw string: '\s' in a plain string is an invalid escape sequence
# (DeprecationWarning, and an error in future Python versions).
inputarray = re.split(r'\s+', inputstring)

##########################KAT STUFF########################
# '+'-joined form of the query; str.join replaces the old manual
# loop-and-append plus trailing-character slice.
katstring = "+".join(inputarray)
#kataddress = "https://kat.cr/usearch/?q=" + katstring #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc" #JUST TV kat
#print(kataddress)

numSeedsArray = []  # seeder counts accumulated across result pages
numLeechArray = []  # leecher counts accumulated across result pages

r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
# First <h2><span> holds text like 'house of cards results 1-25 from 178'.
# NOTE(review): the [-4:] slice assumes the trailing result count is at
# most 4 characters (including a leading space) — confirm for large counts.
totalpages = [h2.find('span') for h2 in soup.findAll('h2')][0].text
totalpages = int(totalpages[-4:])          # keep only the total result count
totalpages = math.floor(totalpages / 25)   # 25 results per page

iteration = 0
savedpage = ""
def getdata(url):
    """Fetch one KAT results page and record its seed/leech counts.

    Side effects: appends one int per result row to the module-level
    numSeedsArray and numLeechArray lists; the script sums these at the end.

    :param url: full URL of a KAT search-results page.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    global numSeedsArray
    global numLeechArray
    # find_all is the modern BS4 name; findAll is the deprecated alias.
    # "green center" cells hold seeder counts, "red lasttd center" leechers.
    seed_cells = soup.find_all("td", {"class": "green center"})
    numSeedsArray += [int(td.text) for td in seed_cells]
    leech_cells = soup.find_all("td", {"class": "red lasttd center"})
    numLeechArray += [int(td.text) for td in leech_cells]
    #print(numSeedsArray)
def getnextpage(url):
    """Derive the URL of the next KAT results page from the pagination links.

    Reads/writes two module-level variables:
      iteration - number of pages processed so far (read only here)
      savedpage - cached URL prefix (everything before the page number),
                  captured on the first pass and reused afterwards.

    NOTE(review): the slice offsets [-27:] / [:-28] assume the pagination
    href has a fixed-length suffix after the page number — confirm against
    the live markup.  If no branch matches (e.g. a 3+ digit page number),
    `nextpage` is never bound and the final `return` raises
    UnboundLocalError.
    """
    global iteration
    global savedpage
    #print("url examined= "+url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    # All pagination buttons on the page; reduced to their href strings.
    nextpagelinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
    nextpagelinks = [link.get('href') for link in nextpagelinks]
    #print(nextpagelinks)
    # The button marked "active" is the page currently being viewed.
    activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
    #print("activepage= " +activepage[0].text)
    currentpagenum = activepage[0].text
    #print("currentpagenum= "+currentpagenum)
    if len(currentpagenum)==1 and iteration>1:
        # Single-digit page, prefix already cached: splice the incremented
        # page number between the cached prefix and the fixed href suffix.
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        # Decode the two URL escapes KAT uses in these hrefs.
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==1 and iteration<=1:
        # First pass: derive the prefix from the first link and cache it
        # in savedpage for later calls.
        nextpage = str(nextpagelinks[0][:-28])+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        savedpage = str(nextpagelinks[0][:-28])
        #print("savedpage= "+savedpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==2:
        # Two-digit page number: same splice as the first branch.
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    return nextpage
# ---------------------------------------------------------------------------
# Crawl KAT result pages, accumulating seed/leech counts via getdata().
# NOTE(review): this branch looks inverted — when totalpages < 2 the loop
# bound (totalpages - 1) is <= 0, so getdata() is never called at all for a
# single-page result set.  The >= 2 case is capped at 2 iterations, which
# the inline comment marks as a temporary testing limit.  Confirm intent.
# ---------------------------------------------------------------------------
if totalpages<2:
    while iteration < totalpages-1: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
else:
    while iteration < 2: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
# print(str(sum(numSeedsArray)))
# print(str(sum(numLeechArray)))
# Combined seeders + leechers across all pages scraped.
print(str(sum(numLeechArray)+sum(numSeedsArray)))
def getgoogdata(title):
    """Google the title and print the total result count.

    :param title: search phrase; spaces are converted to '+' for the URL.
    Prints a list of digit strings extracted from Google's result-stats bar
    (e.g. ['1234567']), or an empty list if the stats element is missing —
    the original code raised AttributeError on None in that case.
    """
    query = re.sub(r' ', '+', title)
    url = 'https://www.google.com/search?q=' + query + '&ie=utf-8&oe=utf-8'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    stats = soup.find("div", {"id": "resultStats"})
    if stats is None:
        # Google served a page without the stats bar (blocked or layout
        # change); report "no count" instead of crashing.
        print([])
        return
    # Text looks like "About 1,234,567 results (0.32 seconds)"; the [:-14]
    # slice drops the "(... seconds)" tail before extracting digit runs.
    resultnum = stats.text[:-14]
    digits_only = resultnum.replace(',', '')
    print(re.findall(r'\b\d+\b', digits_only))

getgoogdata(inputstring)
####################METACRITIC STUFF#########################
# Rebuild the search phrase with spaces for the metacritic URL.
metainputstring = " ".join(inputarray)
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"
print (metacriticaddress)
# Metacritic refuses the default python-requests User-Agent, so the fetched
# page lacked the result markup entirely — which is why the find_all below
# returned nothing.  A browser-like User-Agent yields the real results page.
# (presumably any common browser UA works — verify against the live site)
r = requests.get(metacriticaddress, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(r.content, "html5lib")
# BS4 treats `class` as a multi-valued attribute, so searching for the
# single class "first_result" correctly matches
# <li class="result first_result">.  Equivalent CSS-selector form:
# soup.select("li.result.first_result")
first_result = soup.find_all("li", attrs={"class" : "first_result"})
print(first_result)
これまでの完全なコードを投稿していただけますか? – alecxe
問題ありません。 –