2017-10-21 6 views

私はNSIDC httpsウェブサイトからSMAP衛星データを検索してダウンロードするためにPythonコードで作業しています。私のコードは先週バグを始めるまで働いていました。https NSIDC/NASAウェブサイトへの接続を修正するには?

urllib2.HTTPError: HTTP Error 404: Not Found



"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory. 

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python 
Last edited Jan 26, 2017 G. Deemer""" 

import urllib2 
import os 
from cookielib import CookieJar 
from HTMLParser import HTMLParser 

# Define a custom HTML parser to scrape the contents of the HTML data table 
class MyHTMLParser(HTMLParser): 
    def __init__(self): 
     self.inLink = False 
     self.dataList = [] 
     self.directory = '/' 
     self.indexcol = ';' 
     self.Counter = 0 

    def handle_starttag(self, tag, attrs): 
     self.inLink = False 
     if tag == 'table': 
      self.Counter += 1 
     if tag == 'a': 
      for name, value in attrs: 
       if name == 'href': 
        if self.directory in value or self.indexcol in value: 
         self.inLink = True 
         self.lasttag = tag 

    def handle_endtag(self, tag): 
      if tag == 'table': 
       self.Counter +=1 

    def handle_data(self, data): 
     if self.Counter == 1: 
      if self.lasttag == 'a' and self.inLink and data.strip(): 

parser = MyHTMLParser() 

# Define function for batch downloading 
def BatchJob(Files, cookie_jar): 
    for dat in Files: 
     print "downloading: ", dat 
     JobRequest = urllib2.Request(url+dat) 
     JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into additional HTTP request 
     JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401' 

     # Request the resource at the modified redirect url 
     Request = urllib2.Request(JobRedirect_url) 
     Response = urllib2.urlopen(Request) 
     f = open(dat, 'wb') 
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__)) 
# The following code block is used for HTTPS authentication 

# The user credentials that will be used to authenticate access to the data 
username = "user" 
password = "password" 

# The FULL url of the directory which contains the files you would like to bulk download 

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL 
# Create a password manager to deal with the 401 reponse that is returned from 
# Earthdata Login 

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password) 

# Create a cookie jar for storing cookies. This is used to store and return 
# the session cookie given to use by the data server (otherwise it will just 
# keep sending us back to Earthdata Login to authenticate). Ideally, we 
# should use a file based cookie jar to preserve cookies between runs. This 
# will make it much more efficient. 

cookie_jar = CookieJar() 

# Install all the handlers. 
opener = urllib2.build_opener(
    #urllib2.HTTPHandler(debuglevel=1), # Uncomment these two lines to see 
    #urllib2.HTTPSHandler(debuglevel=1), # details of the requests/responses 

# Create and submit the requests. There are a wide range of exceptions that 
# can be thrown here, including HTTPError and URLError. These should be 
# caught and handled. 

# Open a requeset to grab filenames within a directory. Print optional 

DirRequest = urllib2.Request(url) 
DirResponse = urllib2.urlopen(DirRequest) 

# Get the redirect url and append 'app_type=401' 
# to do basic http auth 
DirRedirect_url = DirResponse.geturl() 
DirRedirect_url += '&app_type=401' 

# Request the resource at the modified redirect url 
DirRequest = urllib2.Request(DirRedirect_url) 
DirResponse = urllib2.urlopen(DirRequest) 

DirBody = DirResponse.read(DirResponse) 

# Uses the HTML parser defined above to pring the content of the directory containing data 
Files = parser.dataList 

# Display the contents of the python list declared in the HTMLParser class 
# print Files #Uncomment to print a list of the files 

# Call the function to download all files in url 

BatchJob(Files, cookie_jar) # Comment out to prevent downloading to your working directory 

なぜあなたのリクエストに 'url'を使用しないのですか? 'DirRedirect_url'は404応答を返します。 –


@ t.m.adamコードはURLベースを使用してウェブサイトの各フォルダ内のファイルを検索するためです。これには、ウェブサイトでのチェック、このファイルの入手とデータのダウンロードがあります。 –




"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory. 

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python Last edited Jan 26, 2017 G. Deemer""" 

import urllib2 
import os 
from cookielib import CookieJar 

# Define function for batch downloading 
def BatchJob(Files, cookie_jar): 
    for dat in Files: 
     print "downloading: ", dat 
     JobRequest = urllib2.Request(url+dat) 
     JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into additional HTTP request 
    JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401' 

    # Request the resource at the modified redirect url 
    Request = urllib2.Request(JobRedirect_url) 
    Response = urllib2.urlopen(Request) 
    f = open(dat, 'wb') 
print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__)) 
# The following code block is used for HTTPS authentication 

# The user credentials that will be used to authenticate access to the data 
username = "user" 
password = "password" 

# The FULL url of the directory which contains the files you would like to bulk download 

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL 
# Create a password manager to deal with the 401 reponse that is returned from # Earthdata Login 

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 
           username, password) 

# Create a cookie jar for storing cookies. This is used to store and return 
# the session cookie given to use by the data server (otherwise it will just 
# keep sending us back to Earthdata Login to authenticate). Ideally, we 
# should use a file based cookie jar to preserve cookies between runs. This 
# will make it much more efficient. 

cookie_jar = CookieJar() 

# Install all the handlers. 
opener = urllib2.build_opener(
    #urllib2.HTTPHandler(debuglevel=1), # Uncomment these two lines to see 
    #urllib2.HTTPSHandler(debuglevel=1), # details of the requests/responses 

# Create and submit the requests. There are a wide range of exceptions that 
# can be thrown here, including HTTPError and URLError. These should be 
# caught and handled. 

# Open a requeset to grab filenames within a directory. Print optional 

DirResponse = urllib2.urlopen(url) 
htmlPage = DirResponse.read() 

listFiles = [x.split(">")[0].replace('"', "") 
        for x in htmlPage.split("><a href=") if x.split(">")[0].endswith('.h5"') == True] 

# Display the contents of the python list declared in the HTMLParser class 
# print Files #Uncomment to print a list of the files 

# Call the function to download all files in url 

BatchJob(Files, cookie_jar) # Comment out to prevent downloading to your working directory 