|
BitTorrent文件解析:
BitTorrent文件使用bencode编码,其中包括了4种数据类型:
'd' 开头表示是dict类型,'e'表示结束
'l' (小写字母L)开头表示是list类型,'e'表示结束
'i'开头表示是integer类型,'e'表示结束,可以表示负数
以数字开头表示string类型,数字为string长度,长度与string内容以':'分割
默认所有text类型的属性为utf-8编码,但是大多数BitTorrent包含codepage 和 encoding属性,指定了text的编码格式
"announce" -- tracker服务器的地址,为string
"info" ---文件信息,为dict类型
"name" --单文件模式,表示文件名,多文件模式表示根目录名。
"length" --单文件模式表示文件长度,多文件模式不存在
"piece length" --文件分片大小
"pieces" --为一个长string, 每20个字节表示一个分片的SHA1 hash值。按照文件分片的顺序排列。
分片是按照所有文件组合在一起进行的,即一个分片可能会跨越多个文件。
"files" -- 多文件模式存在,为一个文件列表,每个文件为一个dict类型
"path" -- 文件目录列表,最后一项为文件名
"length" --文件长度
"piece length" --分片大小
以下为draft bep定义的属性
"codepage"
"announce-list" --tracker列表,为二维数组,即将tracker服务器分为多个组
"encoding" -- Text属性的编码类型,string 类型,如 UTF-8
"publisher" -- 发布者
"publisher-url" --发布者 URL
"created by" --创建者,如btcomet,btspirit
"creation date" --创建日期,为UTC格式,需要转化为本地时区可读格式
"comment" --注释
"nodes" -- DHT 节点列表
BitTorrent的标准参见:http://www.bittorrent.org/beps/bep_0003.html
以下是自己写的Python实现,初学Python,代码写起来还都是C/C++风格,慢慢改进吧。
修改代码,bittorrent文件的解码使用异常处理解决文件格式错误的情况,简化处理过程。
bcodec
1 '''
2 Created on 2012-9-30
3
4 @author: ddt
5 '''
class DataEncodedError(Exception):
    """Raised when input data is not valid bencode."""

    # Derives from Exception (not BaseException) so that ordinary
    # ``except Exception`` handlers can catch decoding failures.
    def __str__(self):
        return 'Data Encoded Error'
9
class DataTypeError(Exception):
    """Raised when a value of an unsupported type is passed to the encoder."""

    # Derives from Exception (not BaseException) so that ordinary
    # ``except Exception`` handlers can catch encoding failures.
    def __str__(self):
        return 'Data Type Error'
13
def bdecode(data):
    """Decode the single bencoded value found at the start of *data*.

    The first character selects the reader: a digit means string, 'd' a
    dict, 'i' an integer and 'l' a list.

    Returns a ``(value, length)`` tuple where *length* is the number of
    characters of *data* consumed by the value.

    Raises DataEncodedError if *data* is empty or malformed.
    """
    try:
        leading_chr = data[0]
        if leading_chr.isdigit():
            reader = _read_string
        elif leading_chr == 'd':
            reader = _read_dict
        elif leading_chr == 'i':
            reader = _read_integer
        elif leading_chr == 'l':
            reader = _read_list
        else:
            raise DataEncodedError()
        return reader(data)
    except DataEncodedError:
        # Already the right type (possibly raised by a nested bdecode call).
        raise
    except Exception:
        # Any ordinary failure in a reader (IndexError on truncated input,
        # ValueError on a bad number, ...) means the data is malformed.
        # KeyboardInterrupt / SystemExit are deliberately not intercepted.
        raise DataEncodedError()
34
35
def _read_dict(data):
    """Decode a bencoded dict ('d' <key><value>... 'e') at the start of *data*.

    Returns ``(dict, length)`` where *length* counts the consumed characters,
    including the leading 'd' and the closing 'e'.  Truncated input surfaces
    as an IndexError from the indexing below.
    """
    chunk = {}
    length = 1  # skip the leading 'd'

    while data[length] != 'e':
        key, key_len = bdecode(data[length:])
        length += key_len

        value, value_len = bdecode(data[length:])
        length += value_len

        # Every bencoded value occupies at least two characters; a
        # non-positive length would make this loop spin forever.
        if key_len <= 0 or value_len <= 0:
            raise DataEncodedError()

        chunk[key] = value

    length += 1  # account for the closing 'e'
    return chunk, length
52
def _read_list(data):
    """Decode a bencoded list ('l' <value>... 'e') at the start of *data*.

    Returns ``(list, length)`` where *length* counts the consumed characters,
    including the leading 'l' and the closing 'e'.  Truncated input surfaces
    as an IndexError from the indexing below.
    """
    chunk = []
    length = 1  # skip the leading 'l'

    while data[length] != 'e':
        value, value_len = bdecode(data[length:])
        # Every bencoded value occupies at least two characters; a
        # non-positive length would make this loop spin forever.
        if value_len <= 0:
            raise DataEncodedError()
        chunk.append(value)
        length += value_len

    return chunk, length + 1  # +1 for the closing 'e'
63
64 def _read_string(data):
65 comm_index = data.find(':')
66 str_len = int(data[:comm_index])
67 value = data[comm_index+1:comm_index+1+str_len]
68
69 length = comm_index + 1 + str_len
70 return ''.join(value), length
71
72 def _read_integer(data):
73
74 end_index = data.find('e')
75 value = int(data[1:end_index])
76 length = end_index + 1
77
78 return value, length
79
def bencode(data):
    """Encode *data* (dict, list, str or int, arbitrarily nested) as bencode.

    Returns the encoded string.

    Raises DataTypeError for any other type.  ``bool`` is rejected as well,
    even though it is an ``int`` subclass.
    """
    if isinstance(data, bool):
        raise DataTypeError()
    if isinstance(data, dict):
        return _write_dict(data)
    if isinstance(data, list):
        return _write_list(data)
    if isinstance(data, str):
        return _write_string(data)
    if isinstance(data, int):
        return _write_integer(data)
    raise DataTypeError()
93
def _write_dict(data):
    """Encode a dict as 'd' <key><value>... 'e'.

    Keys are emitted in sorted order, as the bencode format requires, so the
    output does not depend on dict iteration order.

    Raises DataTypeError if any key is not a string.
    """
    keys = list(data)
    if not all(isinstance(key, str) for key in keys):
        raise DataTypeError()

    parts = ['d']
    for key in sorted(keys):
        parts.append(bencode(key))
        parts.append(bencode(data[key]))
    parts.append('e')
    return ''.join(parts)
104
def _write_list(data):
    """Encode a list as 'l' <value>... 'e', keeping the element order."""
    return 'l' + ''.join([bencode(value) for value in data]) + 'e'
113
114 def _write_string(data):
115 return '%d:%s' %(len(data), data)
116
117 def _write_integer(data):
118 return 'i%de' %data
119
120
torrent_file.py
1 from datetime import datetime
2 import bcodec
3 import hashlib
4
5 _READ_MAX_LEN = -1
6
class BTFormatError(Exception):
    """Raised when a torrent file cannot be parsed as valid metainfo."""

    # Derives from Exception (not BaseException) so that ordinary
    # ``except Exception`` handlers can catch it.
    def __str__(self):
        return 'Torrent File Format Error'
10
class TorrentFile(object):
    """Accessors over the decoded metainfo dictionary of a torrent file.

    Call read_file() first; the getters then read from the decoded
    dictionary and return None (or an empty list) for absent entries.
    """

    def __init__(self):
        self.__metainfo = {}          # decoded top-level metainfo dict
        self.__file_name = ''         # path passed to the last read_file()
        self.__bencode_data = None    # raw file content, kept for the info hash

    def read_file(self, filename):
        """Read and decode the torrent file at *filename*.

        Raises BTFormatError if the content is not bencode or its top-level
        value is not a dict.  Errors from opening/reading propagate as-is.
        """
        with open(filename, 'rb') as torrent_file:
            data = torrent_file.read(_READ_MAX_LEN)

        try:
            metainfo, _length = bcodec.bdecode(data)
        except (Exception, bcodec.DataEncodedError):
            # DataEncodedError is listed explicitly in case it does not
            # derive from Exception.
            raise BTFormatError()
        if not isinstance(metainfo, dict):
            raise BTFormatError()

        self.__file_name = filename
        self.__metainfo = metainfo
        self.__bencode_data = data

    def __is_singlefile(self):
        """Return True when the info dict carries a top-level 'length'."""
        return self.__get_meta_info('length') is not None

    def __decode_text(self, text):
        """Decode a byte-string text attribute.

        The codec is the 'encoding' entry if present, else 'cp<codepage>'
        if 'codepage' is present, else utf-8.  Returns None for empty or
        missing text, and the input unchanged when it is not a byte string
        or cannot be decoded.
        """
        if not text:
            return None
        if not isinstance(text, bytes):
            return text  # already text; nothing to decode

        encoding = self.get_encoding()
        if encoding is None:
            codepage = self.get_codepage()
            encoding = 'utf-8' if codepage is None else 'cp' + str(codepage)

        try:
            return text.decode(encoding)
        except (ValueError, LookupError, TypeError):
            # ValueError: undecodable bytes; LookupError: unknown codec
            # name; TypeError: codec name of an unusable type.
            return text

    def __get_meta_top(self, key):
        """Return the top-level metainfo entry *key*, or None if absent."""
        return self.__metainfo.get(key)

    def __get_meta_info(self, key):
        """Return entry *key* of the 'info' dict, or None if absent."""
        meta_info = self.__get_meta_top('info')
        if not isinstance(meta_info, dict):
            return None
        return meta_info.get(key)

    def get_codepage(self):
        """Return the top-level 'codepage' entry, or None."""
        return self.__get_meta_top('codepage')

    def get_encoding(self):
        """Return the top-level 'encoding' entry, or None."""
        return self.__get_meta_top('encoding')

    def get_announces(self):
        """Return the tracker list as a list of groups (list of lists).

        Uses 'announce-list' when present; otherwise a single group holding
        the 'announce' entry (an empty group if that is absent too).
        """
        announces = self.__get_meta_top('announce-list')
        if announces is not None:
            return announces

        announces = [[]]
        ann = self.__get_meta_top('announce')
        if ann:
            announces[0].append(ann)
        return announces

    def get_publisher(self):
        """Return the decoded 'publisher' entry, or None."""
        return self.__decode_text(self.__get_meta_top('publisher'))

    def get_publisher_url(self):
        """Return the decoded 'publisher-url' entry, or None."""
        return self.__decode_text(self.__get_meta_top('publisher-url'))

    def get_creater(self):
        """Return the decoded 'created by' entry, or None."""
        return self.__decode_text(self.__get_meta_top('created by'))

    def get_creation_date(self):
        """Return 'creation date' as a naive UTC datetime, or None."""
        utc_date = self.__get_meta_top('creation date')
        if utc_date is None:
            return None
        return datetime.utcfromtimestamp(utc_date)

    def get_comment(self):
        """Return the raw (undecoded) 'comment' entry, or None."""
        return self.__get_meta_top('comment')

    def get_nodes(self):
        """Return the 'nodes' entry, or None."""
        return self.__get_meta_top('nodes')

    def get_piece_length(self):
        """Return the 'piece length' entry of the info dict, or None."""
        return self.__get_meta_info('piece length')

    def get_piece(self, index):
        """Return the 20-character hash of piece *index*.

        Returns None when 'pieces' is absent or *index* is out of range.
        """
        pieces = self.__get_meta_info('pieces')
        if pieces is None or index < 0:
            return None

        offset = index * 20
        if offset + 20 > len(pieces):
            return None
        return pieces[offset:offset + 20]

    def get_pieces_num(self):
        """Return the number of complete 20-character hashes in 'pieces'."""
        pieces = self.__get_meta_info('pieces')
        if pieces is None:
            return 0
        return len(pieces) // 20

    def get_files(self):
        """Return one dict per file described by the info dict.

        Each dict has:
          'name'        -- list of path components (root name first)
          'length'      -- file length
          'first-piece' -- (piece index, offset in piece) of the first byte
          'last-piece'  -- (piece index, offset in piece) of the last byte

        Returns an empty list when the name, the piece length, or the
        length/files information is missing.  In multi-file mode, listing
        stops at the first entry lacking 'path' or 'length'.
        """
        files = []
        name = self.__decode_text(self.__get_meta_info('name'))
        piece_length = self.get_piece_length()
        if name is None or not piece_length:
            return files

        if self.__is_singlefile():
            file_length = self.__get_meta_info('length')
            if not file_length:
                return files
            files.append({
                'name': [name],
                'length': file_length,
                'first-piece': (0, 0),
                # Position of the last byte (index file_length - 1).
                'last-piece': divmod(file_length - 1, piece_length),
            })
            return files

        meta_files = self.__get_meta_info('files')
        if meta_files is None:
            return files

        # Files are laid out back to back, so a running total of lengths
        # gives each file's starting position in the piece stream.
        total_length = 0
        for one_file in meta_files:
            if 'path' not in one_file or 'length' not in one_file:
                break

            path_list = [name]
            path_list.extend(self.__decode_text(part)
                             for part in one_file['path'])

            first_piece = divmod(total_length, piece_length)
            total_length += one_file['length']

            files.append({
                'name': path_list,
                'length': one_file['length'],
                'first-piece': first_piece,
                # Position of the file's last byte (index total_length - 1).
                'last-piece': divmod(total_length - 1, piece_length),
            })
        return files

    def get_info_hash(self):
        """Return the SHA-1 digest of the raw bencoded 'info' value.

        The top-level dict is walked entry by entry so that only a real
        'info' key is matched, never the text '4:info' inside another value.

        Raises BTFormatError when no data is loaded, the data is malformed,
        or there is no 'info' key.
        """
        data = self.__bencode_data
        if not data or data[:1] != 'd':
            raise BTFormatError()

        pos = 1  # skip the leading 'd'
        try:
            while data[pos:pos + 1] != 'e':
                key, key_len = bcodec.bdecode(data[pos:])
                pos += key_len
                _value, value_len = bcodec.bdecode(data[pos:])
                if key_len <= 0 or value_len <= 0:
                    break  # no progress: malformed data
                if key == 'info':
                    return hashlib.sha1(data[pos:pos + value_len]).digest()
                pos += value_len
        except (Exception, bcodec.DataEncodedError):
            raise BTFormatError()
        raise BTFormatError()
192
193
if __name__ == '__main__':
    # Demo: dump the main attributes of a torrent file.
    import sys

    # Torrent path: first command-line argument, else the sample file.
    filename = sys.argv[1] if len(sys.argv) > 1 else r".\narodo.torrent"

    torrent = TorrentFile()

    # Each print takes one pre-formatted string so the output is the same
    # whether print is a statement or a function.
    print("begin to read file")
    torrent.read_file(filename)
    print("end to read file")

    print("announces: %s" % (torrent.get_announces(),))
    print("info_hash: %s" % (list(torrent.get_info_hash()),))
    print("peace length: %s" % (torrent.get_piece_length(),))
    print("code page: %s" % (torrent.get_codepage(),))
    print("encoding: %s" % (torrent.get_encoding(),))
    print("publisher: %s" % (torrent.get_publisher(),))
    print("publisher url: %s" % (torrent.get_publisher_url(),))
    print("creater: %s" % (torrent.get_creater(),))
    print("creation date: %s" % (torrent.get_creation_date(),))
    print("commnent: %s" % (torrent.get_comment(),))
    print("nodes: %s" % (torrent.get_nodes(),))

    for one_file in torrent.get_files():
        print('name: %s' % ('\\'.join(one_file['name']),))
        print('length: %s' % (one_file['length'],))
        print('first-piece: %s' % (one_file['first-piece'],))
        print('last-piece: %s' % (one_file['last-piece'],))
|
|