• Python边学边用BT客户端实现之(一)BitTorrent文件解析


    BitTorrent文件解析:

    BitTorrent文件使用bencode编码,其中包括了4种数据类型:

    'd' 开头表示是dict类型,'e'表示结束

    'l' (小写字母L)开头表示是list类型,'e'表示结束

    'i'开头表示是integer类型,'e'表示结束,可以表示负数

    以数字开头表示string类型,数字为string长度,长度与string内容以':'分割

    默认所有text类型的属性为utf-8编码,但是大多数BitTorrent包含codepage 和 encoding属性,指定了text的编码格式

    "announce" -- tracker服务器的地址,为string
    "info" ---文件信息,为dict类型
      "name" --单文件模式,表示文件名,多文件模式表示根目录名。
      "length" --单文件模式表示文件长度,多文件模式不存在
      "piece length" --文件分片大小
      "pieces" --为一个长string, 每20个字节表示一个分片的SHA1 hash值。按照文件分片的顺序排列。
            分片是按照所有文件组合在一起进行的,即一个分片可能会跨越多个文件。
      "files" -- 多文件模式存在,为一个文件列表,每个文件为一个dict类型
          "path" -- 文件目录列表,最后一项为文件名
          "length" --文件长度

    "piece length"  --分片大小(即上面 info 字典中的同名属性)

    以下为draft bep定义的属性
    "codepage" -- 代码页编号;没有 "encoding" 属性时,以 'cp' + 编号 作为text的编码格式
    "announce-list" --tracker列表,为二维数组,即将tracker服务器分为多个组
    "encoding" -- Text属性的编码类型,string 类型,如 UTF-8
    "publisher" -- 发布者
    "publisher-url" --发布者 URL
    "created by" --创建者,如btcomet,btspirit
    "creation date" --创建日期,为UTC时间戳(自1970-01-01起的秒数),需要转化为本地时区可读格式
    "comment" --注释
    "nodes" -- DHT 节点列表

    BitTorrent的标准参见:http://www.bittorrent.org/beps/bep_0003.html

    以下是自己写的Python实现,初学Python,代码写起来还都是C/C++风格,慢慢改进吧。

     修改代码,bittorrent文件的解码使用异常处理解决文件格式错误的情况,简化处理过程。

    bcodec
      1 '''
      2 Created on 2012-9-30
      3 
      4 @author: ddt
      5 '''
      6 class DataEncodedError(BaseException):
      7     def __str__(self):
      8         return 'Data Encoded Error'
      9 
     10 class DataTypeError(BaseException):
     11     def __str__(self):
     12         return 'Data Type Error'
     13     
     14 def bdecode(data):
     15     try:
     16         leading_chr = data[0]
     17         #print leading_chr,                 
     18         if leading_chr.isdigit():
     19             chunk, length = _read_string(data)
     20             #print chunk
     21         elif leading_chr == 'd':
     22             chunk, length = _read_dict(data)
     23             #print chunk is None
     24         elif leading_chr == 'i':
     25             chunk, length = _read_integer(data)
     26             #print chunk
     27         elif leading_chr == 'l':
     28             chunk, length = _read_list(data)
     29         else:
     30             raise DataEncodedError()
     31         return chunk, length
     32     except:
     33         raise DataEncodedError()
     34     
     35                            
     36 def _read_dict(data):
     37     chunk = {} 
     38     length = 1
     39     
     40     while data[length] != 'e':
     41         key, key_len = bdecode(data[length:])
     42         length += key_len
     43         
     44         value, value_len = bdecode(data[length:])
     45         length += value_len
     46         
     47         chunk[key] = value
     48         #print key
     49         
     50     length += 1
     51     return chunk, length
     52 
     53 def _read_list(data):
     54     chunk = []
     55     length = 1
     56     while data[length] != 'e':
     57         value, value_len = bdecode(data[length:])
     58         chunk.append(value)
     59         length += value_len  
     60         
     61     length += 1
     62     return chunk, length
     63 
     64 def _read_string(data):
     65     comm_index = data.find(':')
     66     str_len = int(data[:comm_index])
     67     value = data[comm_index+1:comm_index+1+str_len]
     68     
     69     length = comm_index + 1 + str_len
     70     return ''.join(value), length
     71 
     72 def _read_integer(data):
     73 
     74     end_index = data.find('e')
     75     value = int(data[1:end_index])
     76     length = end_index + 1
     77     
     78     return  value, length
     79 
     80 def bencode(data):
     81     data_type = type(data)
     82     if data_type == type({}):
     83         result = _write_dict(data)
     84     elif data_type == type([]):
     85         result = _write_list(data)
     86     elif data_type == type(''):
     87         result = _write_string(data)
     88     elif data_type == type(int(0)):
     89         result = _write_integer(data)
     90     else:
     91         raise DataTypeError()
     92     return result
     93 
     94 def _write_dict(data):
     95     result = 'd'
     96     for key, value in data.items():
     97         key_encode = bencode(key)
     98         value_encode = bencode(value)
     99         result += key_encode
    100         result += value_encode
    101 
    102     result += 'e'
    103     return result
    104 
    105 def _write_list(data):
    106     result = 'l'
    107     for value in data:
    108         value_encode = bencode(value)
    109         result += value_encode
    110         
    111     result += 'e'
    112     return result
    113 
    114 def _write_string(data):
    115     return '%d:%s' %(len(data), data)
    116 
    117 def _write_integer(data):
    118     return 'i%de' %data
    119 
    120     
    torrent_file.py
      1 from datetime import datetime
      2 import bcodec
      3 import hashlib
      4 
      5 _READ_MAX_LEN = -1
      6 
      7 class BTFormatError(BaseException):
      8     def __str__(self):
      9         return 'Torrent File Format Error'
     10 
     11 class TorrentFile(object):
     12     
     13     def __init__(self):    
     14         self.__metainfo = {}
     15         self.__file_name = ''
     16         self.__bencode_data = None
     17     
     18     def read_file(self, filename):
     19         
     20         torrent_file = open(filename, 'rb')
     21         data = torrent_file.read(_READ_MAX_LEN)
     22         torrent_file.close()
     23         
     24         try:
     25             metainfo, length = bcodec.bdecode(data)
     26             self.__file_name = filename
     27             self.__metainfo = metainfo
     28             self.__bencode_data = data
     29         except:
     30             raise BTFormatError()
     31         
     32     def __is_singlefile(self):
     33         
     34         return self.__get_meta_info('length') != None
     35     
     36     def __decode_text(self, text):
     37         encoding = 'utf-8'
     38         resultstr = ''
     39         if self.get_encoding() != None:
     40             encoding = self.get_encoding()
     41         elif self.get_codepage() != None:
     42             encoding = 'cp' + str(self.get_codepage())
     43         if text:
     44             try:
     45                 resultstr = text.decode(encoding=encoding)
     46             except ValueError:
     47                 return text
     48         else:
     49             return None
     50         return resultstr
     51     
     52     def __get_meta_top(self, key):
     53         if key in self.__metainfo.keys():
     54             return self.__metainfo[key]
     55         else:
     56             return None
     57     def __get_meta_info(self,key):
     58         meta_info = self.__get_meta_top('info')
     59         if meta_info != None and key in meta_info.keys():
     60                 return meta_info[key]
     61         return None
     62     
     63     def get_codepage(self):
     64         return self.__get_meta_top('codepage')
     65     def get_encoding(self):
     66         return self.__get_meta_top('encoding')
     67     
     68     def get_announces(self):
     69         announces = self.__get_meta_top('announce-list')
     70         if announces != None:
     71             return announces
     72         
     73         announces = [[]]
     74         ann = self.__get_meta_top('announce')
     75         if ann:
     76             announces[0].append(ann)
     77         return announces
     78     
     79     def get_publisher(self):
     80         return self.__decode_text(self.__get_meta_top('publisher'))
     81     def get_publisher_url(self):
     82         return self.__decode_text(self.__get_meta_top('publisher-url'))
     83     
     84     def get_creater(self):
     85         return self.__decode_text(self.__get_meta_top('created by'))
     86     def get_creation_date(self):
     87         utc_date = self.__get_meta_top('creation date')
     88         if utc_date == None:
     89             return utc_date
     90         creationdate = datetime.utcfromtimestamp(utc_date)
     91         return creationdate
     92     def get_comment(self):
     93         return self.__get_meta_top('comment')
     94           
     95     def get_nodes(self):
     96         return self.__get_meta_top('nodes')
     97     
     98     def get_piece_length(self):
     99         return self.__get_meta_info('piece length')
    100     
    101     def get_piece(self, index):
    102         pieces = self.__get_meta_info('pieces')
    103         if pieces == None:
    104             return None
    105         
    106         offset = index*20
    107         if offset+20 > len(pieces):
    108             return None
    109         return pieces[offset:offset+20]
    110     
    111     def get_pieces_num(self):
    112         return len(self.__get_meta_info('pieces'))/20
    113         
    114     def get_files(self):
    115         
    116         files = []
    117         name = self.__decode_text(self.__get_meta_info('name'))
    118         piece_length = self.get_piece_length()
    119         if name == None:
    120             return files
    121         
    122         if self.__is_singlefile():
    123             file_name = name
    124             file_length = self.__get_meta_info('length')
    125             if not file_length:
    126                 return files
    127             
    128             pieces_num = file_length/piece_length
    129             last_piece_offset =  file_length % piece_length
    130             if last_piece_offset != 0:
    131                 pieces_num = int(pieces_num) + 1
    132                 last_piece_offset -= 1
    133             else:
    134                 last_piece_offset = piece_length - 1
    135 
    136             first_piece_offset = 0
    137             
    138             files.append({'name':[file_name], 'length':file_length, 'first-piece':(0, first_piece_offset), 'last-piece':(pieces_num-1,last_piece_offset)})
    139             return files
    140         
    141         folder = name
    142         meta_files = self.__get_meta_info('files')
    143         if meta_files == None:
    144             return files
    145         
    146         total_length = int(0)
    147         for one_file in self.__get_meta_info('files'):
    148             
    149             file_info = {}
    150             path_list = []
    151             path_list.append(folder)
    152                         
    153             if 'path' not in one_file.keys():
    154                 break
    155             for path in one_file['path']:
    156                 path_list.append(self.__decode_text(path))
    157             file_info['name'] = path_list
    158             
    159             if 'length' not in one_file.keys():
    160                 break
    161             
    162             file_info['length'] =  one_file['length']
    163             
    164             piece_index = total_length / piece_length
    165             first_piece_offset =  total_length % piece_length
    166             
    167             total_length += one_file['length']
    168             pieces_num = total_length / piece_length - piece_index
    169             last_piece_offset = total_length % piece_length
    170             
    171             if last_piece_offset != 0:
    172                 pieces_num += 1
    173                 last_piece_offset -= 1
    174             else:
    175                 last_piece_offset = piece_length - 1
    176             
    177             file_info['first-piece'] = (piece_index,first_piece_offset)
    178             file_info['last-piece'] = ((piece_index+pieces_num-1),last_piece_offset)
    179             files.append(file_info)
    180         return files
    181     
    182     def get_info_hash(self):
    183         info_index = self.__bencode_data.find('4:info')
    184         info_data_index = info_index+len('4:info')
    185         
    186         info_value, info_data_len = bcodec.bdecode(self.__bencode_data[info_data_index:])
    187         info_data = self.__bencode_data[info_data_index:info_data_index+info_data_len]
    188         
    189         info_hash = hashlib.sha1()
    190         info_hash.update(info_data)
    191         return info_hash.digest()
    192 
    193     
    194 if __name__ == '__main__':
    195     filename = r".\narodo.torrent"
    196 
    197     torrent = TorrentFile()
    198 
    199     print "begin to read file"
    200     torrent.read_file(filename)
    201 
    202     print "end to read file"
    203 
    204     print "announces: " , torrent.get_announces() 
    205     print "info_hash: ", list(torrent.get_info_hash())
    206     print "peace length:", torrent.get_piece_length()
    207     print "code page:" , torrent.get_codepage()
    208     print "encoding:" , torrent.get_encoding()
    209     print "publisher:" ,torrent.get_publisher()
    210     print "publisher url:", torrent.get_publisher_url()
    211     print "creater:" , torrent.get_creater()
    212     print "creation date:", torrent.get_creation_date()
    213     print "commnent:", torrent.get_comment()
    214     print "nodes:", torrent.get_nodes()
    215     torrent.get_files()
    216     for one_file in torrent.get_files():
    217         print 'name:', '\\'.join(one_file['name'])
    218         print 'length:', one_file['length']
    219         print 'first-piece:', one_file['first-piece']
    220         print 'last-piece:', one_file['last-piece']
  • 相关阅读:
    保险
    cron表达式的用法
    Hive 学习记录
    股票的五种估值方法
    AtCoder Beginner Contest 113 A
    ZOJ 4070 Function and Function
    银行业务队列简单模拟 (数据结构题目)
    算法3-7:银行排队
    算法3-5:n阶Hanoi塔问题
    算法3-1:八进制数
  • 原文地址:https://www.cnblogs.com/piaoliu/p/2708984.html
Copyright © 2020-2023  润新知