随机代码

def cut(self, sentence, cut_all=False, HMM=True):
    """Segment a sentence that contains Chinese characters into words.

    This is a generator: the pieces are yielded one at a time, in the
    order they occur in the input.

    Parameters:
        - sentence: The str (unicode) to be segmented.
        - cut_all: Mode switch. True for full pattern, False for
          accurate pattern.
        - HMM: Whether to use the Hidden Markov Model. Only consulted
          when cut_all is False.

    NOTE(review): the ``self.__cut_*`` helpers rely on private name
    mangling, so this method has to live inside its class body for
    those lookups to resolve.
    """
    # Normalise the input first (per the original note: decode to unicode).
    text = strdecode(sentence)

    # Each mode brings its own pair of regexes and its own block cutter.
    if cut_all:
        han_pattern, skip_pattern = re_han_cut_all, re_skip_cut_all
        segment = self.__cut_all
    else:
        han_pattern, skip_pattern = re_han_default, re_skip_default
        segment = self.__cut_DAG if HMM else self.__cut_DAG_NO_HMM

    # Coarse split by regex, then handle every non-empty block in turn.
    for block in han_pattern.split(text):
        if not block:
            continue

        if han_pattern.match(block):
            # Block matched by the "han" regex: delegate to the mode's cutter.
            for word in segment(block):
                yield word
            continue

        # Anything else is re-split with the "skip" regex.
        for piece in skip_pattern.split(block):
            if skip_pattern.match(piece) or cut_all:
                # Skip-matched pieces (and, in full mode, every piece)
                # are emitted whole.
                yield piece
            else:
                # Accurate mode: emit unmatched pieces character by character.
                for char in piece:
                    yield char
参考地址:【结巴分词资料汇编】结巴中文分词源码分析(2)

代码交流 2021