私はPythonでコンピュータプログラムを書いていますが、それは私が望むよりも遅く実行されます。ここで私のPythonプログラムをもっと速く走らせる方法はありますか?
これが私のコードです:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib
import urllib2
import cookielib
# Pre-compiled patterns for parsing locationary.com pages.
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')  # captures the business title before the trailing "(...)" suffix
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')  # captures the street address inside the parentheses
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')  # matches individual place-listing URLs
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b': # assume it's gzipped data
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
    """Return *text* with each character swapped for its entry in the
    *replacements* mapping; characters without an entry pass through
    unchanged.
    """
    pieces = []
    for ch in text:
        pieces.append(replacements.get(ch, ch))
    return ''.join(pieces)
def handle_listing(listing_url):
    """Resolve one locationary.com listing to a "save" proxy URL.

    Downloads the listing page, searches yellowpages.com for the same
    business, and, when the street addresses are similar enough
    (SequenceMatcher ratio >= 0.5), returns the locationary proxy URL
    that associates the two.  Returns None when the listing already
    links to yellowpages or cannot be matched.
    """
    listing_document = BeautifulSoup(download(listing_url))
    # Ignore pages that already link to yellowpages.
    if listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        return None
    listing_title = listing_document.title.text
    reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
    # Hoisted: the original ran TITLE_MATCH.match() twice on the same title.
    title_match = TITLE_MATCH.match(listing_title)
    address_match = ADDRESS_MATCH.match(listing_title)
    # Fix: the original called ADDRESS_MATCH.match(...).groups() without a
    # None check, so a title matching TITLE_MATCH but not ADDRESS_MATCH
    # raised AttributeError and killed that green thread.
    if title_match is None or address_match is None:
        return None
    title, = title_match.groups()
    address, = address_match.groups()
    yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
        replace_chars(address, reps),
        replace_chars(title, reps),
    )
    yellow_page = BeautifulSoup(download(yellow_page_url))
    page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
    if not page_url:
        return None
    page_url = page_url.a["href"]
    # Removed: `business_name = title[:title.index(",")]` was never used and
    # raised ValueError whenever the title contained no comma.
    page = BeautifulSoup(download(page_url))
    yellow_page_address = page.find("span", {"class" : "street-address"})
    if yellow_page_address and SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
        # presumably every listing URL embeds a pNNNNN.jsp place id --
        # TODO confirm; re.search() returning None here would still raise.
        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
        page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
        return ("http://www.locationary.com/access/proxy.jsp?"
                "ACTION_TOKEN=proxy_jsp$JspView$SaveAction"
                "&inPlaceID=%s&xxx_c_1_f_987=%s" % (pid, page_escaped))
def log_in(final_url):
    """Log in to locationary.com, then fetch *final_url* with the
    authenticated session so the server-side save action runs.
    """
    # NOTE(review): credentials are hard-coded -- move them to a config file
    # or environment variables before sharing or committing this script.
    data = urllib.urlencode({"inUserName":"[email protected]", "inUserPass":"secretword"})
    jar = cookielib.FileCookieJar("cookies")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.addheaders.append(('User-agent', 'Mozilla/4.0'))
    opener.addheaders.append(('Referer', 'http://www.locationary.com/'))
    # Session cookies apparently captured from a browser session.
    opener.addheaders.append(('Cookie','site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066'))
    # NOTE(review): this second Cookie header duplicates the first and its
    # value even begins with the literal text "Cookie: " -- almost certainly
    # a copy/paste mistake; confirm whether the server really needs it.
    opener.addheaders.append(('Cookie','Cookie: site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.4.10.1325009956; __utmc=47547066'))
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data)
    response = opener.open(request)  # performs the login POST
    url = str(final_url)
    anything = opener.open(url)  # hit the proxy/save URL while logged in
    page = anything.read()  # body is read but discarded; only the server-side effect matters
# U.S. states to crawl, spelled in locationary.com URL form
# (spaces replaced by underscores).  Alabama and Alaska are commented
# out -- presumably already processed; confirm before re-enabling.
States = [#'Alabama',
#'Alaska',
'Arizona',
'Arkansas',
'California',
'Colorado',
'Connecticut',
'Delaware',
'Florida',
'Georgia',
'Hawaii',
'Idaho',
'Illinois',
'Indiana',
'Iowa',
'Kansas',
'Kentucky',
'Louisiana',
'Maine',
'Maryland',
'Massachusetts',
'Michigan',
'Minnesota',
'Mississippi',
'Missouri',
'Montana',
'Nebraska',
'Nevada',
'New_Hampshire',
'New_Jersey',
'New_Mexico',
'New_York',
'North_Carolina',
'North_Dakota',
'Ohio',
'Oklahoma',
'Oregon',
'Pennsylvania',
'Rhode_Island',
'South_Carolina',
'South_Dakota',
'Tennessee',
'Texas',
'Utah',
'Vermont',
'Virginia',
'Washington',
'West_Virginia',
'Wisconsin',
'Wyoming']
# Filled in place by find_cities(); consumed by main().
Cities = []
# Hoisted out of find_cities(): compile the city-name pattern once instead of
# on every call.  NOTE(review): the greedy `.*` will over-match if a page ever
# puts two <b>...</b> pairs on one line -- kept as-is to preserve behavior.
CITY_NAME = re.compile('<b>(.*)</b>')

def find_cities(state):
    """Download the index page for *state* and append its city names
    (spaces converted to underscores, URL form) to the global Cities list.
    """
    state_url = 'http://www.locationary.com/place/en/US/' + state
    state_document = download(state_url)
    for city in CITY_NAME.findall(state_document):
        # Redundant str() wrappers from the original removed; the values
        # are already strings.
        Cities.append(replace_chars(city, {' ': '_'}))
# NOTE(review): unused anywhere in the visible code -- candidate for removal.
bestworst = ['0','1']
def main():
for state in States:
find_cities(state)
for city in Cities:
for num in range(0,1):
for pagenum in range(15,16):
print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
if str(num) == '0':
print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
else:
print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
START_URL = 'http://www.locationary.com/place/en/US/' + str(state) + '/' + city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num)
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
count_listings = 0
for final_url in pool.imap(handle_listing, listings):
print final_url
if final_url is not None:
log_in(final_url)
# Script entry point: only run the crawl when executed directly.
if __name__ == '__main__':
    main()
このプログラムを速くする方法はあるでしょうか。それとも不可能なのでしょうか? たくさんのURLをインターネットからダウンロードする必要がありますが、インターネット接続を10〜50倍速くすることはできませんし、私のコンピュータも特別遅いわけではありません。つまり、私のプログラムを10〜50倍速くする方法はありますか? ばかげた質問に聞こえるかもしれませんが、プロのプログラマーはどのようにしてプログラムをより速くしているのでしょうか?
この質問は http://codereview.stackexchange.com/ に属します。 –
複数のスレッドを使用して、異なるページを取得できます。 –
プロのプログラマがプログラムをより速くする方法は、プロファイリングによるものです。 PythonのcProfileモジュールを見てください。 – nmichaels