-
Notifications
You must be signed in to change notification settings - Fork 1
/
semantic_similarity.py
60 lines (41 loc) · 1.79 KB
/
semantic_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import en_vectors_web_lg
import pickle
import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datasets import DATASET
def calculate_similarity(src_files, bug_reports):
    """Compute normalized semantic similarity between bug reports and source files.

    Uses pretrained word vectors (spaCy ``en_vectors_web_lg``) and spaCy's
    ``Doc.similarity`` (cosine similarity over averaged vectors).

    Args:
        src_files: mapping of id -> preprocessed source-file object exposing
            ``'unstemmed'`` token lists under ``file_name``, ``class_names``,
            ``attributes``, ``comments`` and ``method_names``.
        bug_reports: mapping of id -> preprocessed bug-report object exposing
            ``'unstemmed'`` token lists under ``summary`` and
            ``pos_tagged_description``.

    Returns:
        list[list[float]]: one row per bug report, each row holding the
        min-max-normalized similarity against every source file, in
        ``src_files`` iteration order.
    """
    # Load the pretrained word-vector model once (expensive).
    nlp = en_vectors_web_lg.load()

    # One spaCy Doc per source file: concatenation of its identifier and
    # comment tokens (unstemmed so the word vectors match real words).
    src_docs = [nlp(' '.join(src.file_name['unstemmed'] + src.class_names['unstemmed']
                             + src.attributes['unstemmed'] + src.comments['unstemmed']
                             + src.method_names['unstemmed']))
                for src in src_files.values()]

    min_max_scaler = MinMaxScaler()

    all_simis = []
    for report in bug_reports.values():
        report_doc = nlp(' '.join(report.summary['unstemmed']
                                  + report.pos_tagged_description['unstemmed']))

        # Cosine similarity against every source doc; Doc.similarity already
        # returns a float, so build the column vector in one step.
        scores = np.array([report_doc.similarity(src_doc) for src_doc in src_docs],
                          dtype=float).reshape(-1, 1)

        # Normalize similarities to [0, 1] within this report, then flatten
        # the (n, 1) result back to 1-D.
        normalized_scores = min_max_scaler.fit_transform(scores).ravel()

        all_simis.append(normalized_scores.tolist())

    return all_simis
def main():
    """Entry point: load preprocessed corpora, score them, write JSON output."""
    def _load(name):
        # Deserialize one preprocessed pickle from the dataset root.
        with open(DATASET.root / name, 'rb') as fp:
            return pickle.load(fp)

    src_files = _load('preprocessed_src.pickle')
    bug_reports = _load('preprocessed_reports.pickle')

    similarities = calculate_similarity(src_files, bug_reports)

    # Persist one normalized score row per bug report.
    with open(DATASET.root / 'semantic_similarity.json', 'w') as fp:
        json.dump(similarities, fp)


if __name__ == '__main__':
    main()