-
Notifications
You must be signed in to change notification settings - Fork 1
/
vsm_similarity.py
85 lines (59 loc) · 2.93 KB
/
vsm_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pickle
import json
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datasets import DATASET
class Similarity:
    """Revised VSM (rVSM) similarity between source files and bug reports.

    Each source file is represented as one space-joined string of its
    stemmed tokens; cosine similarity against a bug report is then weighted
    by a logistic function of the file's normalized token length, so longer
    files receive a (bounded) boost.
    """

    __slots__ = ['src_files', 'src_strings']

    def __init__(self, src_files):
        # src_files: mapping of file id -> parsed source file. Each value
        # is expected to expose dict-like attributes (file_name,
        # class_names, method_names, pos_tagged_comments, attributes)
        # holding a 'stemmed' token list — assumed from usage below;
        # confirm against the preprocessing step.
        self.src_files = src_files
        # One token string per source file, preserving dict iteration order
        # so indices line up with the similarity matrices built later.
        self.src_strings = [' '.join(src.file_name['stemmed'] + src.class_names['stemmed']
                                     + src.method_names['stemmed'] + src.pos_tagged_comments['stemmed']
                                     + src.attributes['stemmed'])
                            for src in self.src_files.values()]

    def calculate_similarity(self, src_tfidf, reports_tfidf):
        """Calculating cosine similarity between source files and bug reports.

        src_tfidf: tf-idf matrix of the source files (one row per file).
        reports_tfidf: tf-idf matrix of the bug reports (one row per report).
        Returns a list (one entry per report) of per-source-file rVSM
        scores, each min-max rescaled to [0, 1].
        """
        # Normalizing the length of source files to [0, 1].
        # (Typo fix: src_lenghts -> src_lengths.)
        src_lengths = np.array([float(len(src_str.split()))
                                for src_str in self.src_strings]).reshape(-1, 1)
        min_max_scaler = preprocessing.MinMaxScaler()
        normalized_src_len = min_max_scaler.fit_transform(src_lengths)

        # Logistic length function g(x) = 1 / (1 + e^(-12x)): maps the
        # normalized length into (0.5, 1), favoring longer files.
        src_len_score = 1 / (1 + np.exp(-12 * normalized_src_len))

        simis = []
        # For each report, score every source file.
        for report in reports_tfidf:
            s = cosine_similarity(src_tfidf, report)
            # Revised VSM score: cosine similarity weighted by length score.
            rvsm_score = s * src_len_score
            # Re-fitting the scaler rescales this report's scores to [0, 1];
            # concatenate flattens the (n_files, 1) column to a flat vector.
            normalized_score = np.concatenate(
                min_max_scaler.fit_transform(rvsm_score)
            )
            simis.append(normalized_score.tolist())
        return simis

    def find_similars(self, bug_reports):
        """Calculating tf-idf vectors for source and report sets
        to find similar source files for each bug report.
        """
        reports_strings = [' '.join(report.summary['stemmed'] + report.description['stemmed'])
                           for report in bug_reports.values()]
        # Fit the vocabulary on the source files, then project the reports
        # into the same tf-idf space so cosine similarity is meaningful.
        tfidf = TfidfVectorizer(sublinear_tf=True, smooth_idf=False)
        src_tfidf = tfidf.fit_transform(self.src_strings)
        reports_tfidf = tfidf.transform(reports_strings)
        simis = self.calculate_similarity(src_tfidf, reports_tfidf)
        return simis
def main():
    """Load preprocessed data, compute rVSM similarities, save them as JSON."""
    # Unpickle preprocessed source files and bug reports.
    with open(DATASET.root / 'preprocessed_src.pickle', 'rb') as src_file:
        src_files = pickle.load(src_file)
    with open(DATASET.root / 'preprocessed_reports.pickle', 'rb') as reports_file:
        bug_reports = pickle.load(reports_file)

    similarity = Similarity(src_files)
    scores = similarity.find_similars(bug_reports)

    # Saving similarities in a json file
    with open(DATASET.root / 'vsm_similarity.json', 'w') as out_file:
        json.dump(scores, out_file)


if __name__ == '__main__':
    main()