-1
私はPythonの初心者で、ある関数が返した値を同じ関数にもう一度渡すことに苦労しています。
コンテキスト: redditからデータをスクレイピングするWebクローラーに取り組んでいます。投稿と投稿へのコメントは収集できており、次のページのhrefを next_page という変数に保存しています。
私の問題は、その next_page を getPosts に渡し戻す方法が分からないことです。
何かガイダンスをいただければ大変ありがたいです。
import requests
import re
from bs4 import BeautifulSoup
import time
from fake_useragent import UserAgent
import datetime
# Sets the default string encoding to UTF-8 (Python 2-only hack: reload()
# restores the sys.setdefaultencoding attribute that site.py deletes at
# startup, so the reload must happen before the setdefaultencoding call).
import sys
from collections import deque
reload(sys)
sys.setdefaultencoding('UTF8')
# Global state shared by the crawler functions below.
ua = UserAgent()  # rotating User-Agent generator for polite scraping
dt=datetime.datetime.now()  # crawl timestamp printed in the report header
next_page=[]  # href of the next search-results page (rebound to a str by getPosts)
comment_url=[]  # hrefs of comment threads found on a results page
url_list=[]  # work queue of search-result page urls (head = next to fetch)
############################################
#Declare functions
def getComments(comment_url):
    """Placeholder that ignores its argument and yields an empty string.

    Redefined with the real implementation further down the file.
    """
    return ''
def getNext(next_page):
    """Placeholder that simply echoes its argument back to the caller.

    Redefined with the real implementation further down the file.
    """
    return next_page
############################################
# Optionally redirect stdout to a file to capture the crawl report.
#sys.stdout=open('reddit_output.txt','w')
############################################
# Report header: crawl date followed by the first-post marker.
print "Crawl Date: ", dt.strftime('%B %d %Y'),"\n"
print "First Post"
#############################################
####################################################################################################################################
#capture individual search results
def getPosts(next_page):
    """Fetch one reddit search-results page and collect comment-thread links.

    Parameters:
        next_page -- href to queue behind the base search url ('' on the
                     first call).
    Returns:
        list of comment-thread hrefs found on the fetched page.
    Side effects:
        rebinds the module-level ``url_list`` so its head is the href of the
        *following* results page, scraped from the page's nav buttons.
    """
    global url_list
    time.sleep(2)  # be polite to reddit between requests
    # Fresh random User-Agent for each request.
    headers = {'user-agent': UserAgent().random}
    # Work queue: base search url first, then the caller-supplied next page.
    url_list = ["https://www.reddit.com/r/HealthInsurance/search?q=health+insurance&sort=new&t=all", next_page]
    url = url_list[0]
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    comment_url = []
    # The head url has been fetched; drop it from the queue.
    del url_list[0]
    # The second 'listing search-result-listing' block holds the results.
    content = soup.findAll(class_='listing search-result-listing')[1]
    for results in content.findAll(class_='contents'):
        for post in results.findAll(class_=' search-result search-result-link no-linkflair '):
            # BUG FIX: the original iterated over every child element of
            # `post` and appended the same comments link once per child,
            # filling comment_url with duplicates.  Append exactly once
            # per post instead.  (Unused `title`/`meta` locals removed.)
            comment_url.append(post.find(class_="search-result-meta").find('a').attrs['href'])
    # Capture the href of the following results page and push it to the
    # head of the queue for the next crawl step.
    next_page = soup.find(class_="nav-buttons").findAll('a')[0].attrs['href']
    url_list.insert(0, next_page)
    return comment_url
    # NOTE(review): the original source had two print statements and a
    # recursive getPosts('') call here, AFTER the return — they could never
    # execute (dead code), so they have been removed.
###################################################################################################################################
def getComments(comment_url):
# Fetch every comment-thread url returned by getPosts() and print each
# original post together with its comments.
# NOTE(review): indentation was lost when this source was scraped.  The
# statements below are assumed to form the function body, with the final
# getComments("") call living at module level as the crawl kick-off —
# confirm against the original before running.
# NOTE(review): `comment_url` is passed to getPosts() as its `next_page`
# parameter — presumably intentional only for the first, empty-string
# call; verify.
url= getPosts(comment_url)
##iterate through comment_list
for link in url:
time.sleep(2)
ua=UserAgent()
headers={'user-agent': ua.random}
response=requests.get(link, headers=headers)
html=response.text
soup=BeautifulSoup(html, 'lxml')
# The original submission and its nested comment tree, by CSS class.
original_post=soup.find(class_='sitetable linklisting')
content=soup.find(class_="sitetable nestedlisting")
#print comments
print "Original Post##########################################\n"
print original_post.get_text()
print "Comments###############################################\n"
print content.get_text()
print '|'
print "New Post++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
# Kick off the crawl with an empty next_page on the first call.
getComments("")
####################################################################################################################################
#passes the next page to getPosts
print "/////////////////////next search results page//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////"
print url_list[0]
getNext(next_page)
def getNext(next_page):
url=(url_list[0])
print url
getPosts(url)
getNext(next_page)
`getNext` を上記のように先に宣言してから `getNext(next_page)` を呼び出す必要があります。 –
また、関数内では `return` 文より後のコードは実行されません。 –