def get_user_data(self, start_url):
    """Fetch one listing page and append every post on it to ``a.txt``.

    The page at ``start_url`` is downloaded through ``self.session`` (using
    ``self.headers`` and ``self.cookies``) and parsed as HTML.  Every
    ``div`` whose class contains ``"c"`` and whose id contains ``"M"`` is
    treated as one post.  The fields of each post are stored in
    ``self.data`` and the dict is written out immediately as one JSON line,
    so a page with N posts produces N lines.  Writing only after the loop
    would keep just the last post, because ``self.data`` is overwritten on
    every iteration.

    Args:
        start_url: URL of the page to scrape.

    Raises:
        IndexError: if a post has no ``a.nk`` link, no first ``span`` in
            its first ``div``, or no ``span.ct`` element.
    """
    response = self.session.get(
        url=start_url, headers=self.headers, cookies=self.cookies
    )
    selector = etree.fromstring(
        response.content, etree.HTMLParser(encoding='utf-8')
    )
    posts = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')

    # Open the output once per page; each post is appended inside the loop.
    with open('a.txt', 'a', encoding='utf-8') as f:
        for post in posts:
            user_id = post.xpath('./div[1]/a[@class="nk"]/@href')[0]
            contents = post.xpath('./div[1]/span[1]')[0].xpath('string(.)')
            times = post.xpath('./div/span[@class="ct"]/text()')[0]

            # The counters sit in the last child div; which anchors hold
            # them depends on how many child divs the post has.
            if post.xpath('./div[3]'):
                imgages = post.xpath('./div[2]/a/img/@src')
                praise_num = post.xpath('./div[3]/a[2]/text()')
                transmit_num = post.xpath('./div[3]/a[3]/text()')
            elif post.xpath('./div[2]'):
                imgages = post.xpath('./div[2]/a/img/@src')
                praise_num = post.xpath('./div[2]/a[3]/text()')
                transmit_num = post.xpath('./div[2]/a[4]/text()')
            else:
                imgages = ''
                praise_num = post.xpath('./div[1]/a[2]/text()')
                transmit_num = post.xpath('./div[1]/a[3]/text()')

            # self.data is reused for every post: drop the timestamp-derived
            # keys so a post never inherits them from the previous one.
            for stale_key in ('mobile_phone', 'month_day', 'create_time'):
                self.data.pop(stale_key, None)

            try:
                if 'from' in times:
                    month_day, clock, device = times.split(maxsplit=2)
                    self.data['mobile_phone'] = device
                else:
                    # NOTE(review): the second token was called "device" but
                    # never stored; confirm what it actually contains.
                    clock, _unused = times.split(maxsplit=1)
                    # Previously unbound here, which raised NameError or
                    # reused the previous post's value.
                    month_day = ''
                    self.data['month_day'] = month_day
                # strip() avoids a leading space when month_day is empty.
                self.data['create_time'] = (month_day + ' ' + clock).strip()
            except ValueError as e:
                # Too few whitespace-separated tokens to unpack.
                print('failure:', e)

            self.data['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.data['user_id'] = user_id
            # Remove zero-width spaces embedded in the post text.
            self.data['contents'] = contents.replace('\u200b', '')
            self.data['imgages'] = imgages
            self.data['praise_num'] = praise_num
            self.data['transmit_num'] = transmit_num

            f.write(json.dumps(self.data) + '\n')
各ページのデータを取得してファイルに保存しようとしています。しかし、'a.txt' には各ページにつき1件のデータしか保存されません。すべてのページのデータを 'a.txt' に正しく保存するには、どのように書き込めばよいでしょうか?
それは、ループの各反復で self.data のさまざまな値を上書きしているためです。ループの後で一度だけ書き込むと、最後の1行分のデータしか保存されません。次の書き込み処理をループの中に入れてください。
# Append the current contents of self.data to a.txt as a single JSON line.
with open('a.txt', mode='a', encoding='utf-8') as out:
    line = json.dumps(self.data) + '\n'
    out.write(line)