-
Notifications
You must be signed in to change notification settings - Fork 1
/
ptt_word_embedding.py
208 lines (182 loc) · 5.96 KB
/
ptt_word_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import pymongo
from pymongo import MongoClient
import urllib, re, json
import jieba
import gensim
import logging
import pickle
from itertools import chain
# ----- login db ----- #
password = urllib.parse.quote_plus('gjoKClmg8eQDF4pKeVXMkTnX7wL/9MVilkavArDouNA=')
client = MongoClient('mongodb://achiii:' + password + '@140.112.147.132')
# ----- connect to PTT corpus ----- #
db = client['PTT']
# ----- extract all board names ----- #
board_list = db.collection_names()
print(board_list)
# ----- access all boards ----- #
AllTogether = db['AllTogether']
Baseball = db['Baseball']
Beauty = db['Beauty']
Boy_Girl = db['Boy-Girl']
BuyTogether = db['BuyTogether']
ChangHua = db['ChangHua']
Chiayi = db['Chiayi']
ChungLi = db['ChungLi']
Daliao = db['Daliao']
FengYuan = db['FengYuan']
FongShan = db['FongShan']
Food = db['Food']
FuMouDiscuss = db['FuMouDiscuss']
Gossiping = db['Gossiping']
Hate = db['Hate']
Hsinchu = db['Hsinchu']
Hualien = db['Hualien']
I_Lan = db['I-Lan']
Jinmen = db['Jinmen']
Kaohsiung = db['Kaohsiung']
Keelung = db['Keelung']
Linyuan = db['Linyuan']
LoL = db['LoL']
Matsu = db['Matsu']
MenTalk = db['MenTalk']
Miaoli = db['Miaoli']
NBA = db['NBA']
Nantou = db['Nantou']
PH_sea = db['PH-sea']
PingTung = db['PingTung']
PuzzleDragon = db['PuzzleDragon']
Sad = db['Sad']
Stock = db['Stock']
StupidClown = db['StupidClown']
TaichungBun = db['TaichungBun']
TaichungCont = db['TaichungCont']
Tainan = db['Tainan']
Taitung = db['Taitung']
Taoyuan = db['Taoyuan']
ToS = db['ToS']
WomenTalk = db['WomenTalk']
Yunlin = db['Yunlin']
ask = db['ask']
happy = db['happy']
home_sale = db['home-sale']
iPhone = db['iPhone']
joke = db['joke']
movie = db['movie']
prozac = db['prozac']
# ----- set ref corpus ----- #
test_data = []
ptt_posts = []
ptt_boards = [AllTogether, Baseball, Beauty, Boy_Girl, BuyTogether,ChangHua, Chiayi, ChungLi, Daliao,\
FengYuan, FongShan, Food, FuMouDiscuss, Gossiping, Hate, Hsinchu,Hualien, I_Lan, Jinmen,\
Kaohsiung, Keelung, Linyuan, LoL, Matsu, MenTalk, Miaoli, NBA, Nantou, PH_sea, PingTung,\
PuzzleDragon, Sad,Stock, StupidClown, TaichungBun, TaichungCont, Tainan, Taitung, Taoyuan,\
ToS, WomenTalk, Yunlin, ask, happy, home_sale, iPhone, joke, movie, prozac]
for boards in ptt_boards:
print('Processing content in ptt board:', boards.name)
for posts in boards.find({},{'content':True})[:1000]: # change the number of post here!
test_data.append([posts['content']])
# print(test_data) # test_data contains all the posts in ptt_boards
# ----- data cleaning and re-segmentation with user-defined dict ----- #
punctuations = '\n|\?|\?|\.|。|,|\^|\s|=|、|!|\!|\/|\/|\)|\(|\)|>|<|"||\「|\」|\:|◎|\☆|\Σ|\-|~|˙|→|一|_+|'
for content in test_data:
print(test_data.index(content)+1,'/',len(test_data))
for post in content:
# for nums in [position for position, article in enumerate(content)]:
stopword = open('stopword.txt','r', encoding='utf8').read()
jieba.load_userdict('aged.lexicon/target.test.txt') # seg by test word list
clean_content = re.sub(punctuations,'', post)
word = jieba.cut(clean_content, cut_all=False)
ptt_posts.append([i for i in word if i not in stopword])
# print(ptt_posts) # ptt_posts contains all the segmented sentences in all the ptt_boards
# ----- train word embedding ----- #
print('Training Word2Vec model...')
# model = gensim.models.Word2Vec(ptt_posts, min_count=1)
model = gensim.models.word2vec.Word2Vec(sentences= ptt_posts, size=100, alpha=0.025, window=10, min_count=5,\
max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,\
cbow_mean=1, iter=5, null_word=0, trim_rule=None, sorted_vocab=1,\
batch_words=10000) # train ptt sentences with w2v model
# ----- write training result as python pickle file ----- #
print('Writing taining result as pickle...')
filename = 'word_embedding_model.sav'
pickle.dump(model, open(filename, 'wb'))
print('Finished')
# ----- load model from pickle ----- #
loaded_model = pickle.load(open(filename, 'rb'))
# ----- view trained result ----- #
'''
測試:以10個relevant words為一組單位,每一個age category隨機選兩個詞彙,人工檢查該target word (e.g 正妹) 所產出的 relevant words中,
語意不合理的範圍在哪裡,並將不合理的semantic distance值設為threshold,抓出threshold以上的relevant word數量
'''
res = loaded_model.wv.most_similar(positive = ['中國'], topn = 10)
# find the top 10000 relevant words and values to target word
# ----- threshold testing ----- # # let the num be 0.537275
'''
Threshold 的決定方法:將三個category中怪異詞range的最大值和最小值加總平均
最大值:0.60749
最小值:0.46706
最終threshold: 0.537275
'''
w2v_res = []
for relevant_word in res:
if relevant_word[1] > 0.537275:
w2v_res.append(relevant_word[0])
print(len(w2v_res)) # number of relevant words above threshold value
'''
[ptt 總版數] 49個
[ptt 所有版的po文數量(放進w2v訓練的data數量)] 3,888,530
[用來做測試的target word與抓到怪異詞的value (以10個詞為一個range)]
Category: l
word: 淑女
怪異詞range: 變速(0.54412) - 法拉力 (0.53880)
word: 紳士
怪異詞range: 傻大姐(0.48208) - 裝扮 (0.47884)
Category: m
word: 人妻
怪異詞range: 謝承均(0.57328) - 未亡人 (0.56730)
word: 帥哥
怪異詞range: 真正(0.60749) - 正到 (0.60422)
Category: s
word: 肥宅
怪異詞range: 人一(0.46820) - 漂亮女孩 (0.46706)
word: 正妹
怪異詞range: 錢帥(0.49944) - 正到 (0.49867)
[Threshold&在評估值裡面的詞彙數量]
Threshold = 0.537275
Category: l (11 words)
淑女 53
紳士 19
嬌娃 809
姑娘 71
夥計 1
女士 16
君子 154
大哥 31
大姊 not in vocabulary
少年 27
小子 7
Category: m (11 words)
美女 270
帥哥 435
優質 7
人氣 70
人妻 157
色狼 147
女友 124
男友 132
老公 371
老婆 351
太太 124
Category: s (11 words)
潮男 not in vocabulary
網美 378
型男 not in vocabulary
熟女 160
大媽 36
女神 263
正妹 332
美眉 263
肥宅 215
宅男 not in vocabulary
辣妹 223
'''