BitTorrent文件解析:
BitTorrent文件使用bencode编码,其中包括了4种数据类型:
'd' 开头表示是dict类型,'e'表示结束
'l' (小写字母L)开头表示是list类型,'e'表示结束
'i'开头表示是integer类型,'e'表示结束,可以表示负数
以数字开头表示string类型,数字为string长度,长度与string内容以':'分割
默认所有text类型的属性为utf-8编码,但是大多数BitTorrent包含codepage 和 encoding属性,指定了text的编码格式
"announce" -- tracker服务器的地址,为string
"info" ---文件信息,为dict类型
"name" --单文件模式,表示文件名,多文件模式表示根目录名。
"length" --单文件模式表示文件长度,多文件模式不存在
"piece length" --文件分片大小
"pieces" --为一个长string, 每20个字节表示一个分片的SHA1 hash值。按照文件分片的顺序排列。
分片是按照所有文件组合在一起进行的,即一个分片可能会跨越多个文件。
"files" -- 多文件模式存在,为一个文件列表,每个文件为一个dict类型
"path" -- 文件目录列表,最后一项为文件名
"length" --文件长度
"piece length" --分片大小
以下为draft bep定义的属性
"codepage" --Text属性使用的代码页编号(如 936 对应编码 cp936)
"announce-list" --tracker列表,为二维数组,即将tracker服务器分为多个组
"encoding" -- Text属性的编码类型,string 类型,如 UTF-8
"publisher" -- 发布者
"publisher-url" --发布者 URL
"created by" --创建者,如btcomet,btspirit
"creation date" --创建日期,为UTC的UNIX时间戳(自1970-01-01起的秒数),需要转化为本地时区可读格式
"comment" --注释
"nodes" -- DHT 节点列表
BitTorrent的标准参见:http://www.bittorrent.org/beps/bep_0003.html
以下是自己写的Python实现,初学Python,代码写起来还都是C/C++风格,慢慢改进吧。
修改代码,bittorrent文件的解码使用异常处理解决文件格式错误的情况,简化处理过程。
'''
Created on 2012-9-30

@author: ddt

Minimal bencode codec (the encoding used by BitTorrent metainfo files).

Four value types are supported:
    dict     'd' <key><value>... 'e'
    list     'l' <value>... 'e'
    integer  'i' <decimal digits, optionally negative> 'e'
    string   '<length>:<content>'

The codec works on ``str`` objects: decoded strings and dictionary keys are
returned as ``str`` slices of the input.
'''


class DataEncodedError(Exception):
    """Raised when the input is not well-formed bencoded data."""

    def __str__(self):
        return 'Data Encoded Error'


class DataTypeError(Exception):
    """Raised by bencode() for a value that bencode cannot represent."""

    def __str__(self):
        return 'Data Type Error'


# Python 2 has a separate ``long`` type for big integers (e.g. file lengths
# above 2 GiB on 32-bit builds); accept it where it exists.
try:
    _INTEGER_TYPES = (int, long)
except NameError:
    _INTEGER_TYPES = (int,)


def bdecode(data):
    """Decode the bencoded value found at the start of *data*.

    Anything following the first complete value is ignored, which lets a
    caller decode a value embedded in a larger buffer.

    Returns:
        A tuple ``(value, length)`` where *value* is the decoded dict, list,
        int or str and *length* is the number of characters consumed.

    Raises:
        DataEncodedError: if *data* is empty, truncated or otherwise not
            valid bencode.
    """
    try:
        return _decode_at(data, 0)
    except DataEncodedError:
        raise
    except Exception:
        # IndexError (ran off the end), ValueError (bad number / missing
        # terminator), TypeError (wrong input type) and similar all mean
        # the same thing to the caller: the data is not valid bencode.
        raise DataEncodedError()


def _decode_at(data, pos):
    """Decode the value starting at index *pos*; return (value, end index)."""
    leading_chr = data[pos]
    if '0' <= leading_chr <= '9':
        return _read_string(data, pos)
    if leading_chr == 'd':
        return _read_dict(data, pos)
    if leading_chr == 'i':
        return _read_integer(data, pos)
    if leading_chr == 'l':
        return _read_list(data, pos)
    raise DataEncodedError()


def _read_dict(data, pos=0):
    """Decode the dict whose leading 'd' is at *pos*.

    Returns ``(dict, end)`` where *end* is the index just past the closing
    'e' (equal to the consumed length when *pos* is 0).
    """
    chunk = {}
    pos += 1  # skip the leading 'd'

    while data[pos] != 'e':
        key, pos = _decode_at(data, pos)
        if not isinstance(key, str):
            # bencode dictionary keys must be strings
            raise DataEncodedError()
        value, pos = _decode_at(data, pos)
        chunk[key] = value

    return chunk, pos + 1  # + 1 skips the closing 'e'


def _read_list(data, pos=0):
    """Decode the list whose leading 'l' is at *pos*.

    Returns ``(list, end)`` where *end* is the index just past the closing
    'e' (equal to the consumed length when *pos* is 0).
    """
    chunk = []
    pos += 1  # skip the leading 'l'

    while data[pos] != 'e':
        value, pos = _decode_at(data, pos)
        chunk.append(value)

    return chunk, pos + 1  # + 1 skips the closing 'e'


def _read_string(data, pos=0):
    """Decode the '<length>:<content>' string starting at *pos*.

    Returns ``(content, end)`` where *end* is the index just past the
    content (equal to the consumed length when *pos* is 0).

    Raises:
        ValueError: if there is no ':' separator.
        DataEncodedError: if the length prefix is not made of digits or the
            data is shorter than the declared length.
    """
    # index() rather than find(): a missing ':' must be an error, not -1.
    colon_index = data.index(':', pos)
    digits = data[pos:colon_index]
    if not digits.isdigit():
        raise DataEncodedError()
    str_len = int(digits)

    start = colon_index + 1
    end = start + str_len
    if end > len(data):
        # Slicing would silently return a short string for truncated input.
        raise DataEncodedError()

    return data[start:end], end


def _read_integer(data, pos=0):
    """Decode the 'i<number>e' integer whose leading 'i' is at *pos*.

    Returns ``(int, end)`` where *end* is the index just past the closing
    'e' (equal to the consumed length when *pos* is 0).

    Raises:
        ValueError: if the closing 'e' is missing or the digits are invalid.
    """
    # index() rather than find(): a missing 'e' must be an error, not -1.
    end_index = data.index('e', pos)
    value = int(data[pos + 1:end_index])

    return value, end_index + 1


def bencode(data):
    """Encode *data* (dict, list, str or integer, arbitrarily nested).

    Returns:
        The bencoded ``str``.

    Raises:
        DataTypeError: if *data*, or any value nested inside it, has an
            unsupported type (bool, float, None, ...), or if a dictionary
            key is not a string.
    """
    if isinstance(data, bool):
        # bool is a subclass of int but is not a bencode integer.
        raise DataTypeError()
    if isinstance(data, dict):
        return _write_dict(data)
    if isinstance(data, list):
        return _write_list(data)
    if isinstance(data, str):
        return _write_string(data)
    if isinstance(data, _INTEGER_TYPES):
        return _write_integer(data)
    raise DataTypeError()


def _write_dict(data):
    """Encode a dict; keys must be strings and are emitted in sorted order."""
    keys = list(data)
    if not all(isinstance(key, str) for key in keys):
        raise DataTypeError()

    parts = ['d']
    # Sorted keys give every dictionary exactly one valid encoding, which
    # is what makes hashes of encoded data (e.g. the info hash) reproducible.
    for key in sorted(keys):
        parts.append(_write_string(key))
        parts.append(bencode(data[key]))
    parts.append('e')
    return ''.join(parts)


def _write_list(data):
    """Encode a list as 'l' + encoded items + 'e'."""
    return 'l' + ''.join(bencode(value) for value in data) + 'e'


def _write_string(data):
    """Encode a string as '<length>:<content>'."""
    return '%d:%s' % (len(data), data)


def _write_integer(data):
    """Encode an integer as 'i<number>e'."""
    return 'i%de' % data
"""Read-only access to the metadata stored in a BitTorrent .torrent file."""
from datetime import datetime
import bcodec
import hashlib

# Size argument passed to file.read(); a negative value reads the whole file.
_READ_MAX_LEN = -1

# Each entry of the 'pieces' string is one 20-byte SHA1 digest.
_PIECE_HASH_LEN = 20


class BTFormatError(Exception):
    """Raised when a torrent file cannot be decoded as a bencoded dict."""

    def __str__(self):
        return 'Torrent File Format Error'


class TorrentFile(object):
    """Parsed view of one torrent file.

    Call read_file() first; every getter returns None (or an empty
    container) when the requested attribute is absent from the metainfo.
    """

    def __init__(self):
        self.__metainfo = {}         # decoded top-level dictionary
        self.__file_name = ''        # path last passed to read_file()
        self.__bencode_data = None   # raw file content, kept for the info hash

    def read_file(self, filename):
        """Load and decode the torrent file at *filename*.

        Raises:
            IOError / OSError: if the file cannot be opened or read.
            BTFormatError: if the content is not bencoded data or its
                top-level value is not a dictionary.
        """
        # 'with' closes the handle even when read() fails.
        with open(filename, 'rb') as torrent_file:
            data = torrent_file.read(_READ_MAX_LEN)

        try:
            metainfo, length = bcodec.bdecode(data)
        except bcodec.DataEncodedError:
            raise BTFormatError()
        if not isinstance(metainfo, dict):
            # Every getter below assumes a dictionary at the top level.
            raise BTFormatError()

        self.__file_name = filename
        self.__metainfo = metainfo
        self.__bencode_data = data

    def __is_singlefile(self):
        """Return True when 'info' carries a 'length' (single-file mode)."""
        return self.__get_meta_info('length') is not None

    def __decode_text(self, text):
        """Decode a text attribute using the torrent's declared encoding.

        The 'encoding' attribute wins over 'codepage'; utf-8 is the default.
        Returns None for a missing or empty *text*, and *text* unchanged
        when it cannot be decoded.
        """
        if not text:
            return None

        encoding = 'utf-8'
        if self.get_encoding() is not None:
            encoding = self.get_encoding()
        elif self.get_codepage() is not None:
            encoding = 'cp' + str(self.get_codepage())

        try:
            return text.decode(encoding)
        except (ValueError, LookupError, TypeError):
            # ValueError covers UnicodeDecodeError; LookupError/TypeError
            # cover an unknown or malformed encoding name in the torrent.
            return text

    def __get_meta_top(self, key):
        """Return the top-level metainfo value for *key*, or None."""
        return self.__metainfo.get(key)

    def __get_meta_info(self, key):
        """Return the value for *key* inside the 'info' dict, or None."""
        meta_info = self.__get_meta_top('info')
        if isinstance(meta_info, dict):
            return meta_info.get(key)
        return None

    def get_codepage(self):
        """Return the 'codepage' attribute, or None."""
        return self.__get_meta_top('codepage')

    def get_encoding(self):
        """Return the 'encoding' attribute, or None."""
        return self.__get_meta_top('encoding')

    def get_announces(self):
        """Return the trackers as a list of groups (a list of lists).

        'announce-list' is returned as stored when present; otherwise the
        single 'announce' URL, if any, is wrapped as ``[[url]]``. With no
        tracker at all the result is ``[[]]``.
        """
        announces = self.__get_meta_top('announce-list')
        if announces is not None:
            return announces

        announces = [[]]
        ann = self.__get_meta_top('announce')
        if ann:
            announces[0].append(ann)
        return announces

    def get_publisher(self):
        """Return the decoded 'publisher' attribute, or None."""
        return self.__decode_text(self.__get_meta_top('publisher'))

    def get_publisher_url(self):
        """Return the decoded 'publisher-url' attribute, or None."""
        return self.__decode_text(self.__get_meta_top('publisher-url'))

    def get_creater(self):
        """Return the decoded 'created by' attribute, or None."""
        return self.__decode_text(self.__get_meta_top('created by'))

    def get_creation_date(self):
        """Return 'creation date' as a naive datetime in UTC, or None."""
        utc_date = self.__get_meta_top('creation date')
        if utc_date is None:
            return None
        return datetime.utcfromtimestamp(utc_date)

    def get_comment(self):
        """Return the raw (undecoded) 'comment' attribute, or None."""
        return self.__get_meta_top('comment')

    def get_nodes(self):
        """Return the 'nodes' attribute, or None."""
        return self.__get_meta_top('nodes')

    def get_piece_length(self):
        """Return the 'piece length' value from 'info', or None."""
        return self.__get_meta_info('piece length')

    def get_piece(self, index):
        """Return the 20-byte hash of piece number *index* (0-based).

        Returns None when 'pieces' is missing or *index* is out of range.
        """
        pieces = self.__get_meta_info('pieces')
        if pieces is None or index < 0:
            return None

        offset = index * _PIECE_HASH_LEN
        if offset + _PIECE_HASH_LEN > len(pieces):
            return None
        return pieces[offset:offset + _PIECE_HASH_LEN]

    def get_pieces_num(self):
        """Return the number of piece hashes (0 when 'pieces' is missing)."""
        pieces = self.__get_meta_info('pieces')
        if pieces is None:
            return 0
        return len(pieces) // _PIECE_HASH_LEN

    @staticmethod
    def __piece_span(start, length, piece_length):
        """Locate a file inside the concatenated piece stream.

        *start* is the file's byte offset in the stream. Returns
        ``((first_piece, offset), (last_piece, offset))`` for its first and
        last byte; for a zero-length file the "last" position is the byte
        just before *start*.
        """
        first_piece = divmod(start, piece_length)
        last_piece = divmod(start + length - 1, piece_length)
        return first_piece, last_piece

    def get_files(self):
        """Describe every file in the torrent.

        Returns a list of dicts with the keys:
            'name'        -- list of path components; in multi-file mode the
                             first component is the torrent's root folder
            'length'      -- file length in bytes
            'first-piece' -- (piece index, offset in piece) of the first byte
            'last-piece'  -- (piece index, offset in piece) of the last byte

        The list is empty when 'name', 'piece length' or the file
        information is missing. In multi-file mode parsing stops at the
        first entry lacking 'path' or 'length', and the entries parsed up to
        that point are returned.
        """
        files = []
        name = self.__decode_text(self.__get_meta_info('name'))
        piece_length = self.get_piece_length()
        if name is None or not piece_length:
            # No name, or a missing/zero piece length that cannot be divided by.
            return files

        if self.__is_singlefile():
            file_length = self.__get_meta_info('length')
            if not file_length:
                return files

            first_piece, last_piece = self.__piece_span(
                0, file_length, piece_length)
            files.append({
                'name': [name],
                'length': file_length,
                'first-piece': first_piece,
                'last-piece': last_piece,
            })
            return files

        meta_files = self.__get_meta_info('files')
        if meta_files is None:
            return files

        # Pieces are cut from all files laid end to end, so a running
        # offset is needed to place each file in the piece stream.
        total_length = 0
        for one_file in meta_files:
            if 'path' not in one_file or 'length' not in one_file:
                break

            path_list = [name]
            for path in one_file['path']:
                path_list.append(self.__decode_text(path))

            first_piece, last_piece = self.__piece_span(
                total_length, one_file['length'], piece_length)
            total_length += one_file['length']

            files.append({
                'name': path_list,
                'length': one_file['length'],
                'first-piece': first_piece,
                'last-piece': last_piece,
            })
        return files

    def get_info_hash(self):
        """Return the SHA1 digest of the bencoded 'info' value.

        The hash is taken over the exact bytes found in the file. The
        top-level dictionary is walked key by key to find them, because a
        plain text search for '4:info' could match inside an earlier string
        value.

        Returns None when no file has been read or there is no 'info' key.
        """
        data = self.__bencode_data
        if not data:
            return None

        pos = 1  # skip the leading 'd' of the top-level dictionary
        while data[pos] != 'e':
            key, key_len = bcodec.bdecode(data[pos:])
            pos += key_len
            value, value_len = bcodec.bdecode(data[pos:])
            if key == 'info':
                return hashlib.sha1(data[pos:pos + value_len]).digest()
            pos += value_len
        return None


def _print_field(label, value):
    """Print *label* and *value* separated by one space."""
    print('%s %s' % (label, value))


if __name__ == '__main__':
    filename = r".\narodo.torrent"

    torrent = TorrentFile()

    print("begin to read file")
    torrent.read_file(filename)

    print("end to read file")

    info_hash = torrent.get_info_hash()

    _print_field("announces: ", torrent.get_announces())
    _print_field("info_hash: ",
                 list(info_hash) if info_hash is not None else None)
    _print_field("peace length:", torrent.get_piece_length())
    _print_field("code page:", torrent.get_codepage())
    _print_field("encoding:", torrent.get_encoding())
    _print_field("publisher:", torrent.get_publisher())
    _print_field("publisher url:", torrent.get_publisher_url())
    _print_field("creater:", torrent.get_creater())
    _print_field("creation date:", torrent.get_creation_date())
    _print_field("commnent:", torrent.get_comment())
    _print_field("nodes:", torrent.get_nodes())
    for one_file in torrent.get_files():
        _print_field('name:', '\\'.join(one_file['name']))
        _print_field('length:', one_file['length'])
        _print_field('first-piece:', one_file['first-piece'])
        _print_field('last-piece:', one_file['last-piece'])