From 312bac440b74250fa916f3fc43ac3d6ec652be38 Mon Sep 17 00:00:00 2001
From: Yue Fei
Date: Fri, 23 Aug 2024 14:02:34 +0800
Subject: [PATCH] Fix jieba bug (#163)

---
 src/pai_rag/utils/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pai_rag/utils/tokenizer.py b/src/pai_rag/utils/tokenizer.py
index 076c76fb..92d71e64 100644
--- a/src/pai_rag/utils/tokenizer.py
+++ b/src/pai_rag/utils/tokenizer.py
@@ -14,7 +14,7 @@
 ## PUT in utils file and add stopword in TRIE structure.
 def jieba_tokenizer(text: str) -> List[str]:
     tokens = []
-    for w in jieba.lcut(text):
+    for w in jieba.cut(text):
         token = w.lower()
         if not stop_trie.match(token):
             tokens.append(token)