A Chinese Search Plugin for Sphinx

Writing documentation with Sphinx is a pleasure; "Scientific Computing with Python" (用Python做科学计算), for instance, is built with Sphinx. Unfortunately, Sphinx's built-in search does not work for Chinese. The root cause is that it does no Chinese word segmentation, so the generated index file searchindex.js contains no proper entries for Chinese words. The tokenizing code lives in search.py inside the Sphinx package directory. That module splits out English words with a regular expression, so fixing the problem only requires replacing the two places that use the word_re variable with a Chinese word-segmentation algorithm.
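
To see why, here is a quick demonstration (my own sketch, not code from search.py) of what that regular expression does: \w+ under the Unicode flag treats a contiguous run of CJK characters as one giant "word", so a whole Chinese phrase becomes a single index term.

# -*- coding: utf-8 -*-
# demo only: the tokenizer regex from Sphinx's search.py applied
# to English and to Chinese text
import re

word_re = re.compile(r'\w+(?u)')

print word_re.findall(u'offline search index')
# -> [u'offline', u'search', u'index']
print word_re.findall(u'中文分词测试')
# -> [u'中文分词测试']  (one token for the whole phrase)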
 
For easier maintenance I did not modify that file directly; instead I wrote an extension plugin. Its source code is listed below:
 
# -*- coding: utf-8 -*-
"""
    sphinx.search
    ~~~~~~~~~~~~~

    Create a search index for offline search.

    :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re
import cPickle as pickle

from docutils.nodes import comment, Text, NodeVisitor, SkipNode

from sphinx.util import jsdump, rpartition
try:
    # http://bitbucket.org/methane/porterstemmer/
    from porterstemmer import Stemmer as CStemmer
    CSTEMMER = True
except ImportError:
    from sphinx.util.stemmer import PorterStemmer
    CSTEMMER = False

from smallseg import SEG

#testfile = file("testfile.txt", "wb")

# the stock English tokenizer from Sphinx's search.py, kept for reference;
# Chinese text is segmented with smallseg instead (see WordCollector below)
word_re = re.compile(r'\w+(?u)')

stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())


class _JavaScriptIndex(object):
    """
    The search index as javascript file that calls a function
    on the documentation search object to register the index.
    """

    PREFIX = 'Search.setIndex('
    SUFFIX = ')'

    def dumps(self, data):
        return self.PREFIX + jsdump.dumps(data) + self.SUFFIX

    def loads(self, s):
        data = s[len(self.PREFIX):-len(self.SUFFIX)]
        if not data or not s.startswith(self.PREFIX) or not \
           s.endswith(self.SUFFIX):
            raise ValueError('invalid data')
        return jsdump.loads(data)

    def dump(self, data, f):
        f.write(self.dumps(data))

    def load(self, f):
        return self.loads(f.read())


js_index = _JavaScriptIndex()


if CSTEMMER:
    class Stemmer(CStemmer):

        def stem(self, word):
            return self(word.lower())

else:
    class Stemmer(PorterStemmer):
        """
        All those porter stemmer implementations look hideous.
        make at least the stem method nicer.
        """

        def stem(self, word):
            word = word.lower()
            return PorterStemmer.stem(self, word, 0, len(word) - 1)


class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    Text nodes are segmented with smallseg instead of word_re.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        self.found_words = []

    def dispatch_visit(self, node):
        if node.__class__ is comment:
            raise SkipNode
        if node.__class__ is Text:
            # seg.cut() takes UTF-8 bytes and returns tokens in reverse
            # order, so restore the reading order here
            words = seg.cut(node.astext().encode("utf8"))
            words.reverse()
            self.found_words.extend(words)
            #testfile.write(node.astext().encode("utf8")+"\n")
            #testfile.write(u",".join(words).encode("utf8") + "\n")


class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env):
        self.env = env
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # objtype -> index
        self._objtypes = {}
        # objtype index -> objname (localized)
        self._objnames = {}

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        if isinstance(format, basestring):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        for k, v in frozen['terms'].iteritems():
            if isinstance(v, int):
                self._mapping[k] = set([index2fn[v]])
            else:
                self._mapping[k] = set(index2fn[i] for i in v)
        # no need to load keywords/objtypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_objects(self, fn2index):
        rv = {}
        otypes = self._objtypes
        onames = self._objnames
        for domainname, domain in self.env.domains.iteritems():
            for fullname, dispname, type, docname, anchor, prio in \
                    domain.get_objects():
                # XXX use dispname?
                if docname not in fn2index:
                    continue
                if prio < 0:
                    continue
                # XXX splitting at dot is kind of Python specific
                prefix, name = rpartition(fullname, '.')
                pdict = rv.setdefault(prefix, {})
                try:
                    i = otypes[domainname, type]
                except KeyError:
                    i = len(otypes)
                    otypes[domainname, type] = i
                    otype = domain.object_types.get(type)
                    if otype:
                        # use unicode() to fire translation proxies
                        onames[i] = unicode(domain.get_type_name(otype))
                    else:
                        onames[i] = type
                pdict[name] = (fn2index[docname], i, prio)
        return rv

    def get_terms(self, fn2index):
        rv = {}
        for k, v in self._mapping.iteritems():
            if len(v) == 1:
                fn, = v
                if fn in fn2index:
                    rv[k] = fn2index[fn]
            else:
                rv[k] = [fn2index[fn] for fn in v if fn in fn2index]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        terms = self.get_terms(fn2index)
        objects = self.get_objects(fn2index) # populates _objtypes
        objtypes = dict((v, k[0] + ':' + k[1])
                        for (k, v) in self._objtypes.iteritems())
        objnames = self._objnames
        return dict(filenames=filenames, titles=titles, terms=terms,
                    objects=objects, objtypes=objtypes, objnames=objnames)

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        for wordnames in self._mapping.itervalues():
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)

        def add_term(word, stem=self._stemmer.stem):
            word = stem(word)
            # stopwords, pure numbers and words shorter than two characters
            # are not indexed (this also drops one-character Chinese words)
            if len(word) < 2 or word in stopwords or word.isdigit():
                return
            self._mapping.setdefault(word, set()).add(filename)

        # the title is segmented with smallseg as well
        words = seg.cut(title.encode("utf8"))
        for word in words:
            add_term(word)
        for word in visitor.found_words:
            add_term(word)

def load_indexer(self):
    """Build a replacement for the HTML builder's load_indexer method
    that uses the Chinese-aware IndexBuilder defined above."""
    def func(docnames):
        import os.path as path
        print "############### CHINESE INDEXER ###############"
        self.indexer = IndexBuilder(self.env)
        keep = set(self.env.all_docs) - set(docnames)
        try:
            f = open(path.join(self.outdir, self.searchindex_filename), 'rb')
            try:
                self.indexer.load(f, self.indexer_format)
            finally:
                f.close()
        except (IOError, OSError, ValueError):
            if keep:
                self.warn('search index couldn\'t be loaded, but not all '
                          'documents will be built: the index will be '
                          'incomplete.')
        # delete all entries for files that will be rebuilt
        self.indexer.prune(keep)
    return func

def builder_inited(app):
    if app.builder.name == 'html':
        print "****************************"
        # create one shared segmenter; WordCollector and IndexBuilder.feed
        # reach it through this module-level global
        global seg
        seg = SEG()
        # monkey-patch the HTML builder so the search index is built with
        # our IndexBuilder rather than the default English-only one
        app.builder.load_indexer = load_indexer(app.builder)

def setup(app):
    app.connect('builder-inited', builder_inited)

 
This extension uses the smallseg library for Chinese word segmentation.
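
For reference, here is a minimal sketch of how the plugin drives smallseg. The sample string is arbitrary; as in the plugin, cut() is given UTF-8 bytes, and the result is reversed because smallseg appears to return the tokens last-to-first.

# -*- coding: utf-8 -*-
# minimal smallseg usage sketch, mirroring the calls made in the plugin
from smallseg import SEG

seg = SEG()
words = seg.cut(u'用Sphinx编写中文文档'.encode('utf8'))  # pass UTF-8 bytes
words.reverse()   # restore reading order, as WordCollector does
print u'/'.join(words).encode('utf8')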

The smallseg library can be downloaded from: http://code.google.com/p/smallseg
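
With smallseg on the Python path, enabling the plugin takes one line in conf.py. The module name chinese_search below is only a placeholder; use whatever filename you saved the source above under:

# conf.py -- assuming the plugin source was saved as chinese_search.py
# next to conf.py (the name is a placeholder, not fixed by the plugin)
import sys, os
sys.path.insert(0, os.path.abspath('.'))   # make the plugin importable

extensions = ['chinese_search']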
