This is a very strange bug I'm running into. I have a Python script that generates a JSON object, which I want to return using JsonResponse. The response comes back fine, but sometimes it contains extra copies of parts of the object: Django's JsonResponse sometimes sends back duplicated objects, and I don't see how that's possible.
The script is a web crawler that emits JSON for the nodes and edges of a graph, and duplicate nodes and edges come back seemingly at random. The script that does the crawling and generates the JSON behaves correctly every time on its own; the problem only occurs when it is invoked in the context of a Django request.
In my views.py I have this function that drives the web crawler script:
def webcrawler(request):
    source = request.GET.get('source')
    method = request.GET.get('method')
    nodes = request.GET.get('nodes')
    depth = request.GET.get('depth')
    keyword = request.GET.get('keyword')
    webcrawler = WebCrawler(source, method, nodes, depth, keyword)
    data = webcrawler.jsonSerialize()
    return JsonResponse(data, safe=False)
My jsonSerialize() function looks like this:
def jsonSerialize(self):
    for n in self.graph.nodes:
        n.sourceNodes = []
    self.graph.edges = list(self.graph.edges)
    return json.dumps(self.graph, default=lambda o: o.__dict__)
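As an aside, jsonSerialize() returns a string that json.dumps() has already encoded, so JsonResponse(data, safe=False) serializes that string a second time. This doesn't explain the duplicates, but a minimal sketch of sending the pre-serialized string unchanged (same view as above, nothing else assumed) would be:

from django.http import HttpResponse

def webcrawler(request):
    source = request.GET.get('source')
    method = request.GET.get('method')
    nodes = request.GET.get('nodes')
    depth = request.GET.get('depth')
    keyword = request.GET.get('keyword')
    crawler = WebCrawler(source, method, nodes, depth, keyword)
    data = crawler.jsonSerialize()  # already a JSON-encoded string
    # HttpResponse sends the string as-is; JsonResponse would encode it again
    return HttpResponse(data, content_type="application/json")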
Why do I get back a random number of duplicates in this response? I make a new request and construct a brand-new object every time, yet the number of objects returned seems to creep up and occasionally drop back down. The same thing happens if I open a new tab and make the request from there. What could possibly be causing this? A normal response looks like this:
{
"nodes": [
{
"keyword": false,
"url": "http://www.google.com",
"sourceNodes": [],
"title": "Google",
"index": 0
},
{
"keyword": false,
"url": "http://www.google.com/imghp?hl=en&tab=wi",
"sourceNodes": [],
"title": "Google Images",
"index": 4
},
{
"keyword": false,
"url": "http://www.youtube.com/?tab=w1",
"sourceNodes": [],
"title": "YouTube",
"index": 3
},
{
"keyword": false,
"url": "http://news.google.com/nwshp?hl=en&tab=wn",
"sourceNodes": [],
"title": "Google News",
"index": 2
},
{
"keyword": false,
"url": "http://maps.google.com/maps?hl=en&tab=wl",
"sourceNodes": [],
"title": "Google Maps",
"index": 1
}
],
"edges": [
{
"source": 0,
"target": 1
},
{
"source": 0,
"target": 3
},
{
"source": 0,
"target": 2
},
{
"source": 0,
"target": 4
}
]
}
That is what I get from, for example, webcrawler?source=http://www.google.com&method=BFS&nodes=5&depth=0&keyword=google. But if I keep repeating the same request, it sometimes comes back with duplicated nodes and edges, like this:
{
"nodes": [
{
"keyword": false,
"url": "https://mail.google.com/mail/?tab=wm",
"sourceNodes": [],
"title": "Gmail",
"index": 1
},
{
"keyword": false,
"url": "https://www.google.com/intl/en/options/",
"sourceNodes": [],
"title": "\n Our Products | Google\n ",
"index": 7
},
{
"keyword": false,
"url": "http://www.google.com/imghp?hl=en&tab=wi",
"sourceNodes": [],
"title": "Google Images",
"index": 6
},
{
"keyword": false,
"url": "https://drive.google.com/?tab=wo",
"sourceNodes": [],
"title": "Meet Google Drive \u2013 One place for all your files",
"index": 2
},
{
"keyword": false,
"url": "http://news.google.com/nwshp?hl=en&tab=wn",
"sourceNodes": [],
"title": "Google News",
"index": 8
},
{
"keyword": false,
"url": "http://maps.google.com/maps?hl=en&tab=wl",
"sourceNodes": [],
"title": "Google Maps",
"index": 3
},
{
"keyword": true,
"url": "https://play.google.com/?hl=en&tab=w8",
"sourceNodes": [],
"title": "Google Play",
"index": 9
},
{
"keyword": false,
"url": "https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://www.google.com/",
"sourceNodes": [],
"title": "Sign in - Google Accounts",
"index": 4
},
{
"keyword": false,
"url": "http://www.google.com",
"sourceNodes": [],
"title": "Google",
"index": 0
},
{
"keyword": false,
"url": "http://www.google.com/history/optout?hl=en",
"sourceNodes": [],
"title": " Google - Search Customization ",
"index": 5
},
{
"keyword": false,
"url": "http://www.youtube.com/?tab=w1",
"sourceNodes": [],
"title": "YouTube",
"index": 3
}
],
"edges": [
{
"source": 0,
"target": 1
},
{
"source": 0,
"target": 7
},
{
"source": 0,
"target": 6
},
{
"source": 0,
"target": 5
},
{
"source": 0,
"target": 4
},
{
"source": 0,
"target": 9
},
{
"source": 0,
"target": 3
},
{
"source": 0,
"target": 8
},
{
"source": 0,
"target": 2
}
]
}
When I run the Python script locally with the same arguments over and over, I never see any of these ill effects. Since the script appears to behave correctly on its own, this makes me think the problem has something to do with the Django / JsonResponse request.
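For reference, this is roughly how I compare runs locally. It is only a sketch: the module name WebCrawler.py is an assumption about the project layout, matching the Graph and Node imports in the crawler code below.

from WebCrawler import WebCrawler  # assumed module name

# Crawl the same page twice with identical arguments and compare the JSON.
a = WebCrawler("http://www.google.com", "BFS", 5, 0, "google").jsonSerialize()
b = WebCrawler("http://www.google.com", "BFS", 5, 0, "google").jsonSerialize()
print(a == b)  # locally the two runs come back consistent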
Here is the code for my web crawler:
import urllib
# from urllib.request import urlopen
from urlparse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import re
import pdb
class WebCrawler:
    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit)
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)
    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = {l for l in links}  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()
    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks
    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except:
            return False
        return True
    @timeout(1)
    def testRequest(self, url):
        requests.head(url)
    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if len(self.nodesToVisit) != 0 and not self.nodeLimitReached():
            # We use the same data structure to store urlsToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important.
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            return
    def printGraph(self):
        for node in self.graph.nodes:
            print("\nNode:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
                print("URL: " + node.url)
                print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print("\nEdges:")
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print("\nKeyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print("\nJSON:")
        print(self.jsonSerialize())
    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        self.graph.nodes = list(self.graph.nodes)
        return json.dumps(self.graph, default=lambda o: o.__dict__)
    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass
    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes
    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup
    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs
    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        title = soup.title  # same as soup.find("title")
        if title is not None:
            return title.get_text()
    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()
    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(newNode)
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()
    def checkForKeyword(self, soup, url):
        if self.keyword != "":
            # If keyword found in soup, append url to keywordUrls
            if soup.body and soup.body.find_all(string=re.compile('.*{0}.*'.format(self.keyword)), recursive=True):
                self.keywordUrls.append(url)
                return True
Can you paste an example of the JSON? In what way is it duplicated? – Udi
@Udi I've added some JSON examples showing what comes back. – 123
I don't see any duplicates there - those are just different results. – Udi