2011-10-19 11 views
14

目標は、以下の記事で説明されているように、HTTP の Range リクエストを使用して、コンテンツ全体をダウンロードすることなく特定のファイルだけを取得することです: http://www.codeproject.com/KB/cs/remotezip.aspx リモートの zip からファイルを取得するためのライブラリはありますか?

+1

そして、こちらがコードです。ところで、いい質問ですね。 – Blender

+0

どのプロトコルを使用したいですか? ssh(scp)、ftp、sftp、http、..? – chown

+0

HTTP です。質問に書いたとおり、HTTP の Range リクエストを使用する必要があります。 –

答えて

16

この問題は、より少ないコードで、もう少し汎用的に解決できます。基本的には、ZipFile に渡して使えるだけの機能を備えたファイルライクオブジェクトを作成します。つまり、最終的には z = ZipFile(HttpFile(url)) と書くだけで、必要な部分だけが動的にダウンロードされます。この方法の利点は、書くコードが少なくて済むことと、zip ファイル以外にも応用できることです。(実際、このようなものが既に存在しているのではないかと思っています...)

同じ考え方で、HttpFile にキャッシュ用のラッパーを被せて、繰り返しダウンロードするのを避けることもできます。以下がそのコードです(エラー処理が欠けている点に注意してください)。なお、このメソッドは明らかにサブディレクトリを考慮していません。

#!/usr/bin/python 
import urllib2 

class HttpFile(object):
    """Read-only, seekable file-like object backed by HTTP Range requests.

    Implements just enough of the file protocol (``read``, ``seek``,
    ``tell``) to be handed to consumers such as ``ZipFile``; every ``read``
    turns into one ranged GET against ``url``.
    """

    def __init__(self, url):
        self.url = url
        self.offset = 0    # current read position, in bytes from the start
        self._size = -1    # cached total size; negative means "not fetched yet"

    def size(self):
        """Return the total size of the resource in bytes.

        The value is taken from the ``Content-length`` header of a plain
        request and cached, so at most one request is made for it.
        """
        if self._size < 0:
            f = urllib2.urlopen(self.url)
            try:
                self._size = int(f.headers["Content-length"])
            finally:
                # Only the headers are needed; do not leave the response open.
                f.close()
        return self._size

    def read(self, count=-1):
        """Read up to ``count`` bytes from the current offset.

        A negative ``count`` reads to the end of the resource.  Like a
        regular file, a read that reaches the end returns fewer bytes than
        requested, and a read at or past the end returns an empty string.

        Raises:
            IOError: if the server returned a different number of bytes
                than the requested range covers.
        """
        if count == 0:
            return ""
        size = self.size()
        if self.offset >= size:
            return ""
        if count < 0:
            end = size - 1
        else:
            # Clamp to the last byte so the Range header is always valid.
            end = min(self.offset + count, size) - 1

        req = urllib2.Request(self.url)
        req.headers['Range'] = "bytes=%s-%s" % (self.offset, end)
        f = urllib2.urlopen(req)
        try:
            data = f.read()
            status = f.getcode()
        finally:
            f.close()

        if status != 206:
            # The server ignored the Range header and sent the whole entity;
            # cut out the part that was actually asked for.
            data = data[self.offset:end + 1]

        expected = end - self.offset + 1
        if len(data) != expected:
            raise IOError("Expected %d bytes from %s, got %d"
                          % (expected, self.url, len(data)))
        self.offset += len(data)
        return data

    def seek(self, offset, whence=0):
        """Move the read position.

        ``whence`` is 0 (absolute), 1 (relative to the current position) or
        2 (relative to the end, which needs the remote size).

        Raises:
            ValueError: for any other ``whence`` value.
        """
        if whence == 0:
            self.offset = offset
        elif whence == 1:
            self.offset += offset
        elif whence == 2:
            self.offset = self.size() + offset
        else:
            raise ValueError("Invalid whence")

    def tell(self):
        """Return the current read position."""
        return self.offset
+0

優れた実装。ありがとう –

6

そのようなライブラリが見つからなかったので、小さなモジュールを自分で書きました。コードとロジックの大部分は zipfile モジュールから持ってきたもので、seek/read を HTTP の Range リクエストに置き換えています。

自由にレビューして、改善点を提案してください:

コード:

""" 
Read remote ZIP files using HTTP range requests 
""" 
import cStringIO
import struct
import urllib2
import zlib
from os.path import basename, join
from zipfile import BadZipfile, ZipExtFile, ZipInfo

# The code is mostly adapted from the zipfile module
# NOTE: ZIP64 is not supported 

# The "end of central directory" structure, magic number, size, and indices 
# (section V.I in the format document) 
structEndArchive = "<4s4H2LH" 
stringEndArchive = "PK\005\006" 
sizeEndCentDir = struct.calcsize(structEndArchive) 

_ECD_SIGNATURE = 0 
_ECD_DISK_NUMBER = 1 
_ECD_DISK_START = 2 
_ECD_ENTRIES_THIS_DISK = 3 
_ECD_ENTRIES_TOTAL = 4 
_ECD_SIZE = 5 
_ECD_OFFSET = 6 
_ECD_COMMENT_SIZE = 7 
# These last two indices are not part of the structure as defined in the 
# spec, but they are used internally by this module as a convenience 
_ECD_COMMENT = 8 
_ECD_LOCATION = 9 

# The "central directory" structure, magic number, size, and indices 
# of entries in the structure (section V.F in the format document) 
structCentralDir = "<4s4B4HL2L5H2L" 
stringCentralDir = "PK\001\002" 
sizeCentralDir = struct.calcsize(structCentralDir) 

# indexes of entries in the central directory structure 
_CD_SIGNATURE = 0 
_CD_CREATE_VERSION = 1 
_CD_CREATE_SYSTEM = 2 
_CD_EXTRACT_VERSION = 3 
_CD_EXTRACT_SYSTEM = 4 
_CD_FLAG_BITS = 5 
_CD_COMPRESS_TYPE = 6 
_CD_TIME = 7 
_CD_DATE = 8 
_CD_CRC = 9 
_CD_COMPRESSED_SIZE = 10 
_CD_UNCOMPRESSED_SIZE = 11 
_CD_FILENAME_LENGTH = 12 
_CD_EXTRA_FIELD_LENGTH = 13 
_CD_COMMENT_LENGTH = 14 
_CD_DISK_NUMBER_START = 15 
_CD_INTERNAL_FILE_ATTRIBUTES = 16 
_CD_EXTERNAL_FILE_ATTRIBUTES = 17 
_CD_LOCAL_HEADER_OFFSET = 18 

# The "local file header" structure, magic number, size, and indices 
# (section V.A in the format document) 
structFileHeader = "<4s2B4HL2L2H" 
stringFileHeader = "PK\003\004" 
sizeFileHeader = struct.calcsize(structFileHeader) 

_FH_SIGNATURE = 0 
_FH_EXTRACT_VERSION = 1 
_FH_EXTRACT_SYSTEM = 2 
_FH_GENERAL_PURPOSE_FLAG_BITS = 3 
_FH_COMPRESSION_METHOD = 4 
_FH_LAST_MOD_TIME = 5 
_FH_LAST_MOD_DATE = 6 
_FH_CRC = 7 
_FH_COMPRESSED_SIZE = 8 
_FH_UNCOMPRESSED_SIZE = 9 
_FH_FILENAME_LENGTH = 10 
_FH_EXTRA_FIELD_LENGTH = 11 


def _http_get_partial_data(url, start_range, end_range=None):
    """Open ``url`` with an HTTP ``Range`` header and return the response.

    The header is built from the arguments as follows:

    * ``start_range`` and ``end_range`` given: ``bytes=start-end``
      (both positions inclusive);
    * negative ``start_range`` only: ``bytes=-N``, i.e. the last N bytes;
    * non-negative ``start_range`` only: ``bytes=start-``, i.e. everything
      from ``start`` to the end.

    The caller is responsible for closing the returned response object.
    """
    if end_range is not None:
        range_header = "bytes=%s-%s" % (start_range, end_range)
    elif start_range < 0:
        # The minus sign of the number doubles as the dash of a suffix range.
        range_header = "bytes=%s" % start_range
    else:
        # An open-ended range needs the trailing dash to be a valid spec.
        range_header = "bytes=%s-" % start_range
    req = urllib2.Request(url)
    req.add_header('Range', range_header)
    return urllib2.urlopen(req)


def _parse_content_range(value):
    """Parse a ``Content-Range`` value of the form ``bytes first-last/total``.

    Returns ``(first, total)`` as integers (``total`` is None when the
    server sent ``*``), or None when the value is missing or malformed.
    """
    if not value:
        return None
    unit, _, spec = value.strip().partition(' ')
    if unit.lower() != 'bytes':
        return None
    byte_range, _, total = spec.partition('/')
    first = byte_range.partition('-')[0]
    try:
        first = int(first)
        total = None if total.strip() in ('', '*') else int(total)
    except ValueError:
        return None
    return first, total


def _fetch_tail(url, nbytes):
    """Download the last ``nbytes`` bytes of ``url``.

    Returns ``(data, offset)`` where ``offset`` is the position of
    ``data[0]`` within the remote file.
    """
    response = _http_get_partial_data(url, -nbytes)
    try:
        data = response.read()
        parsed = _parse_content_range(response.headers.get('Content-Range'))
    finally:
        response.close()
    if parsed is None:
        # Without a usable Content-Range the body is taken to be the whole
        # entity (the server ignored the Range header), so it starts at 0.
        return data, 0
    return data, parsed[0]


def _EndRecData(url):
    """Return data from the "End of Central Directory" record.

    The result is a list of the eight items unpacked from the ZIP "End of
    central dir" record, followed by the archive comment and by the offset
    of the record within the remote file.

    Raises:
        IOError: if no "End of central directory" record can be found in
            the last 64K (plus record size) of the file.
    """
    # Fast path: an archive without a comment ends with exactly one record.
    data, data_offset = _fetch_tail(url, sizeEndCentDir)
    if (len(data) == sizeEndCentDir
            and data[0:4] == stringEndArchive
            and data[-2:] == "\000\000"):
        # the signature is correct and there's no comment, unpack structure
        endrec = list(struct.unpack(structEndArchive, data))
        # Append a blank comment and record start offset
        endrec.append("")
        endrec.append(data_offset)
        return endrec

    # Either this is not a ZIP file, or it is a ZIP file with an archive
    # comment.  Search the end of the file for the "end of central directory"
    # record signature.  The comment is the last item in the ZIP file and may
    # be up to 64K long.  It is assumed that the "end of central directory"
    # magic number does not appear in the comment.

    # Search by retrieving chunks of 256, 1k and 64k
    for check_range in (1 << 8, 1 << 10, 1 << 16):
        data, data_offset = _fetch_tail(url, check_range + sizeEndCentDir)
        start = data.rfind(stringEndArchive)
        if start >= 0:
            recData = data[start:start + sizeEndCentDir]
            if len(recData) == sizeEndCentDir:
                # found the magic number; unpack and interpret
                endrec = list(struct.unpack(structEndArchive, recData))
                # comment size as claimed by the zip file
                commentSize = endrec[_ECD_COMMENT_SIZE]
                comment_start = start + sizeEndCentDir
                endrec.append(data[comment_start:comment_start + commentSize])
                endrec.append(data_offset + start)
                return endrec
        if data_offset == 0:
            # The chunk already started at the beginning of the file, so a
            # larger chunk cannot contain anything new.
            break

    raise IOError("End of central directory record not found")


class HTTPZipFile(object):
    """Read members of a remote ZIP archive using HTTP range requests.

    On construction only the "end of central directory" record and the
    central directory are downloaded; the data of a member is downloaded
    when it is opened with ``open``.  ZIP64 and encrypted members are not
    supported.
    """

    def __init__(self, url):
        self.url = url
        self.NameToInfo = {}    # Find file info given name
        self.filelist = []      # List of ZipInfo instances for archive
        self.pwd = None
        self.comment = ''
        self.debug = 0
        self._RealGetContents()

    def _fetch(self, start, end):
        """Return bytes ``start``..``end`` (inclusive) of the remote file.

        An empty range (``end < start``) returns an empty string without
        issuing a request, because it cannot be expressed as a Range header.
        """
        if end < start:
            return ""
        response = _http_get_partial_data(self.url, start, end)
        try:
            return response.read()
        finally:
            response.close()

    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file.

        Raises:
            BadZipfile: if the end-of-central-directory record cannot be
                read (any IOError from that lookup is reported this way) or
                a central directory entry is truncated or has a bad
                signature.
        """
        try:
            endrec = _EndRecData(self.url)
        except IOError:
            raise BadZipfile("File is not a zip file")
        if not endrec:
            raise BadZipfile("File is not a zip file")
        if self.debug > 1:
            print(endrec)
        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
        self.comment = endrec[_ECD_COMMENT]     # archive comment

        # "concat" is zero, unless zip was concatenated to another file
        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd

        if self.debug > 2:
            inferred = concat + offset_cd
            print("given, inferred, offset %s %s %s"
                  % (offset_cd, inferred, concat))
        # self.start_dir: Position of start of central directory
        self.start_dir = offset_cd + concat
        data = self._fetch(self.start_dir, self.start_dir + size_cd - 1)
        fp = cStringIO.StringIO(data)
        total = 0
        while total < size_cd:
            centdir = fp.read(sizeCentralDir)
            if len(centdir) != sizeCentralDir:
                raise BadZipfile("Truncated central directory")
            if centdir[0:4] != stringCentralDir:
                raise BadZipfile("Bad magic number for central directory")
            centdir = struct.unpack(structCentralDir, centdir)
            if self.debug > 2:
                print(centdir)
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            # Create ZipInfo instance to store file information
            x = ZipInfo(filename)
            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
            (x.create_version, x.create_system, x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                           t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)

            x._decodeExtra()
            x.header_offset = x.header_offset + concat
            x.filename = x._decodeFilename()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x

            # update total bytes read from central directory
            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])

        if self.debug > 2:
            print("total %s" % total)

    def namelist(self):
        """Return a list of file names in the archive."""
        return [zinfo.filename for zinfo in self.filelist]

    def infolist(self):
        """Return a list of class ZipInfo instances for files in the
        archive."""
        return self.filelist

    def printdir(self):
        """Print a table of contents for the zip file."""
        print("%-46s %19s %12s" % ("File Name", "Modified    ", "Size"))
        for zinfo in self.filelist:
            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
            print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size))

    def getinfo(self, name):
        """Return the instance of ZipInfo given 'name'.

        Raises:
            KeyError: if the archive has no member called ``name``.
        """
        info = self.NameToInfo.get(name)
        if info is None:
            raise KeyError(
                'There is no item named %r in the archive' % name)
        return info

    def open(self, name, pwd=None):
        """Return file-like object for 'name'.

        The member's local header, file name and compressed data are
        downloaded with range requests; the returned ``ZipExtFile`` reads
        from an in-memory copy of the compressed data.  ``pwd`` is accepted
        but not used.

        Raises:
            KeyError: if the archive has no member called ``name``.
            RuntimeError: if the URL is unset or the member is encrypted.
            BadZipfile: if the local file header is truncated, has a bad
                signature, or names a different file than the central
                directory.
        """
        if not self.url:
            raise RuntimeError(
                "Attempt to read ZIP archive that was already closed")
        zinfo = self.getinfo(name)

        # Checked before any download: encrypted members cannot be read.
        if zinfo.flag_bits & 0x1:
            raise RuntimeError("File %s is encrypted, "
                               "not supported." % name)

        offset = zinfo.header_offset
        raw_header = self._fetch(offset, offset + sizeFileHeader - 1)
        if len(raw_header) != sizeFileHeader:
            raise BadZipfile("Truncated file header")
        if raw_header[0:4] != stringFileHeader:
            raise BadZipfile("Bad magic number for file header")
        fheader = struct.unpack(structFileHeader, raw_header)

        offset += sizeFileHeader
        fname = self._fetch(offset, offset + fheader[_FH_FILENAME_LENGTH] - 1)
        if fname != zinfo.orig_filename:
            raise BadZipfile(
                'File name in directory "%s" and header "%s" differ.' % (
                    zinfo.orig_filename, fname))

        offset += fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
        # Use the size from the central directory: the local header stores 0
        # when the sizes are written to a trailing data descriptor instead.
        data = self._fetch(offset, offset + zinfo.compress_size - 1)
        return ZipExtFile(cStringIO.StringIO(data), 'r', zinfo)


if __name__ == "__main__":
    # Manual smoke test: list a remote archive, then extract a few of its
    # members into /tmp.  Requires network access.
    link = "http://dfn.dl.sourceforge.net/project/filezilla/FileZilla_Client/3.5.1/FileZilla_3.5.1_win32.zip"
    hzfile = HTTPZipFile(link)
    hzfile.printdir()
    for fname in ('GPL.html', 'resources/blukis/48x48/filter.png', 'resources/finished.wav'):
        # Archive member names always use '/', whatever the local OS uses.
        source_name = 'FileZilla-3.5.1/' + fname
        dest_fname = join('/tmp', basename(fname))
        print("Extracing %s to %s" % (source_name, dest_fname))
        with hzfile.open(source_name) as f:
            data = f.read()
        # Members include binary files (png, wav), so write in binary mode.
        with open(dest_fname, 'wb') as new_file:
            new_file.write(data)
関連する問題