-
Notifications
You must be signed in to change notification settings - Fork 1
/
refine_doc.py
49 lines (37 loc) · 1.39 KB
/
refine_doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import re
from gpt_index import Document, SimpleDirectoryReader
def load_buddha_dict(dict_path='buddha-dict.txt'):
    """Load the term dictionary from a text file holding one term per line.

    Args:
        dict_path: Path of the dictionary file to read.

    Returns:
        A list of the non-blank lines of the file, in file order.
    """
    # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM so
    # the first term is not silently prefixed with U+FEFF.
    with open(dict_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().splitlines()
    # Blank lines are skipped: an empty term handed to str.replace() matches
    # between every pair of characters of the text being processed.
    return [line for line in lines if line.strip()]
def add_space_around_words(text, words):
    """Surround every occurrence of each given word with single spaces.

    The words are applied one after another, in the order given, each as a
    plain substring replacement on the result of the previous one.

    Args:
        text: The string to process.
        words: Iterable of terms to set off with spaces.

    Returns:
        The text with ' word ' substituted for every occurrence of each word.
    """
    for word in words:
        # str.replace('', ...) matches at every position and would insert
        # spaces between all characters, so empty entries are ignored.
        if not word:
            continue
        text = text.replace(word, f' {word} ')
    return text
def merge_consecutive_spaces(text):
    """Collapse every run of whitespace characters into one space.

    The pattern matches any whitespace (including tabs and newlines), so
    line breaks are replaced by a single space as well.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed
def refine_doc(directory_path, output_dir='output', dict_path='buddha-dict.txt'):
    """Load the documents under a directory and space out punctuation and terms.

    Each loaded document is turned into a string, a space is added after
    closing punctuation and before opening punctuation, every dictionary
    term is surrounded by spaces, and whitespace runs are collapsed.

    Args:
        directory_path: Directory handed to SimpleDirectoryReader.
        output_dir: Directory that is created if missing. Nothing is written
            to it at the moment (the per-document write is disabled below).
        dict_path: Path of the term dictionary passed to load_buddha_dict.
            Defaults to the file name that was previously hard-coded.

    Returns:
        A list with one refined string per loaded document.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    documents = SimpleDirectoryReader(directory_path).load_data()

    add_space_after = ',。、:”?!;》】)'
    add_space_before = '“《【('
    # One translation table applies all single-character rules in a single
    # pass instead of one full-string replace() per punctuation mark.
    spacing_table = str.maketrans({
        **{char: char + ' ' for char in add_space_after},
        **{char: ' ' + char for char in add_space_before},
    })

    # Loaded once, outside the loop, and reused for every document.
    buddha_dict = load_buddha_dict(dict_path)

    refined_documents = []
    for i, doc in enumerate(documents):
        if isinstance(doc, Document):
            # NOTE(review): str() of a Document may yield its repr rather
            # than the raw text - confirm, and consider doc.text if so.
            doc = str(doc)
        doc = doc.translate(spacing_table)
        doc = add_space_around_words(doc, buddha_dict)
        doc = merge_consecutive_spaces(doc)
        refined_documents.append(doc)
        # Writing each result into output_dir is currently disabled:
        # with open(os.path.join(output_dir, f'output{i+1}.txt'), 'w', encoding='utf-8') as f:
        #     f.write(doc)
    return refined_documents
# Run the refinement over the documents in the 'input' directory.
refined_docs = refine_doc(directory_path='input')