解析 StarDict 文件

作者 : RY    标签: array
切换行号 全选 下载代码

读取 StarDict 的字典数据文件,使用 array 减小内存占用。

import bisect
import operator
import struct
import sys
from array import array
06 
class WordList(object):
    """Read-only sequence of strings stored in one contiguous buffer.

    All words are concatenated into ``buf``; ``offsets[i]`` and
    ``lengths[i]`` locate word ``i`` inside it.  The per-word bookkeeping
    lives in ``array`` objects rather than in one string object per word.
    """

    def __init__(self, words):
        """Build the buffer and lookup arrays from *words*.

        Args:
            words: a list or tuple of strings, all of the same type
                (text or bytes).

        Raises:
            OverflowError: if a word is longer than 255 items, because
                lengths are stored with the unsigned-char type code "B".
        """
        # Join with an empty string of the same type as the items so that
        # both text and byte strings are accepted.
        separator = words[0][:0] if words else ""
        self.buf = separator.join(words)
        self.offsets = array("L")
        self.lengths = array("B")
        position = 0
        # Single pass: one offset and one length per word, so an empty
        # input yields two empty arrays and a length of zero.
        for word in words:
            size = len(word)
            self.offsets.append(position)
            self.lengths.append(size)
            position += size

    def __getitem__(self, index):
        """Return the word at integer position *index*.

        Raises:
            IndexError: if *index* is out of range.
        """
        start = self.offsets[index]
        end = start + self.lengths[index]
        return self.buf[start:end]

    def __len__(self):
        """Return the number of stored words."""
        return len(self.offsets)

    def __sizeof__(self):
        """Return the summed ``sys.getsizeof`` of the buffer and both arrays."""
        return sum(
            sys.getsizeof(item)
            for item in (self.buf, self.offsets, self.lengths)
        )
27     
28     
class Index(object):
    """Sorted word index mapping each word to an (offset, length) pair.

    The three inputs are parallel sequences: entry ``i`` of *offsets* and
    *lengths* belongs to entry ``i`` of *words*.
    """

    def __init__(self, words, offsets, lengths):
        """Store the index columns compactly.

        Args:
            words: sequence of words; it must already be sorted, because
                ``search`` uses binary search on it.
            offsets: sequence of non-negative integers, one per word.
            lengths: sequence of non-negative integers, one per word.
        """
        self.words = WordList(words)
        self.offsets = array("L", offsets)
        self.lengths = array("L", lengths)

    def search(self, word):
        """Find the first entry whose word is greater than or equal to *word*.

        This is a nearest-match lookup: the returned word is the exact
        match when one exists, otherwise the next word in sort order.

        Args:
            word: the query, comparable with the stored words.

        Returns:
            A tuple ``(position, stored_word, offset, length)``.

        Raises:
            IndexError: if every stored word sorts before *word*
                (including when the index is empty).
        """
        idx = bisect.bisect_left(self.words, word)
        # bisect_left returns len(words) when the query sorts after every
        # entry; report that explicitly instead of failing inside the array.
        if idx >= len(self.words):
            raise IndexError("no index entry at or after %r" % (word,))
        return idx, self.words[idx], self.offsets[idx], self.lengths[idx]

    def __sizeof__(self):
        """Return the summed ``sys.getsizeof`` of the three columns."""
        return sum(
            sys.getsizeof(item)
            for item in (self.words, self.offsets, self.lengths)
        )
42 
43 
def load_index(idxfn):
    """Read an index file and return it as an ``Index``.

    The file is parsed as a sequence of records, each consisting of the
    word bytes, a NUL terminator, and two big-endian unsigned 32-bit
    integers (offset, length).

    Args:
        idxfn: path of the index file.

    Returns:
        An ``Index`` whose entries are sorted by the raw word bytes, so
        that the binary search in ``Index.search`` is valid.  An empty
        file yields an empty ``Index``.

    Raises:
        struct.error: if the file ends before the 8 bytes that should
            follow a NUL terminator.
    """
    info_struct = struct.Struct(">LL")
    # Distance from a word's NUL terminator to the start of the next word.
    record_tail = 1 + info_struct.size

    def _load_index():
        # Yield (word, offset, length) for every record in the file.
        with open(idxfn, "rb") as f:
            data = f.read()
        start = 0
        while True:
            # Byte-string needle: the data was read in binary mode.
            end = data.find(b"\x00", start)
            if end < 0:
                break
            offset, length = info_struct.unpack_from(data, end + 1)
            yield data[start:end], offset, length
            start = end + record_tail

    entries = sorted(_load_index(), key=operator.itemgetter(0))
    if not entries:
        # zip(*[]) produces no columns, so build the empty index directly.
        return Index((), (), ())
    return Index(*zip(*entries))
61 
62 
class Dict(object):
    """Dictionary lookup backed by an index file and a data file.

    The index is loaded into memory once; the data file is opened and
    read only for the requested slice on every ``search`` call.
    """

    def __init__(self, idxfn, dictfn):
        """Load the index from *idxfn* and remember the data file path.

        Args:
            idxfn: path of the index file.
            dictfn: path of the data file holding the entry texts.
        """
        self.index = load_index(idxfn)
        self.dictfn = dictfn

    def search(self, word):
        """Look up *word* and return the matched word and its entry text.

        Args:
            word: the query, as text or as UTF-8 encoded bytes.

        Returns:
            A text string of the form ``"<matched word>\\n<entry text>"``.
            The matched word is whatever ``self.index.search`` returns,
            which may differ from the query.
        """
        # Index keys are raw bytes read from the file, so a text query
        # must be encoded before it can be compared with them.
        if not isinstance(word, bytes):
            word = word.encode("utf8")
        _, word2, offset, length = self.index.search(word)
        with open(self.dictfn, "rb") as f:
            f.seek(offset)
            text = f.read(length)
        return word2.decode("utf8") + "\n" + text.decode("utf8")

    def __sizeof__(self):
        """Return the summed ``sys.getsizeof`` of the index and the path."""
        return sum(sys.getsizeof(item) for item in (self.index, self.dictfn))
77 
if __name__ == "__main__":
    # Demo: load a dictionary, report its size, and look up one word.
    e2c = Dict("sun_dict_e2c.idx", "sun_dict_e2c.dict")
    # sys.getsizeof uses the __sizeof__ overrides defined on the classes.
    # A single formatted argument prints the same on Python 2 and 3.
    print("memory: %s" % sys.getsizeof(e2c))
    print(e2c.search("python"))

loading...