-
Notifications
You must be signed in to change notification settings - Fork 0
/
training-summarizer.py
executable file
·29 lines (23 loc) · 1.16 KB
/
training-summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python3
import argparse
from utils import word_counter
from math import log
# Help texts shown by the command-line interface (passed to argparse below).
DESC = (
    'This application calculates the inverse document frequencies (idf) '
    'of every word in the training dataset.'
)
TRAINING_DIRECTORY_DESC = 'The directory containing text files to be used in training.'
# The embedded newline mirrors the original two-line literal.
OUTPUT_FILE_DESC = (
    'File containing every word in training,\n'
    'the amount of times each word was seen and its corresponding idf.'
)
def write_idf_table(word_counts, number_of_documents, outfile):
    """Write one tab-separated ``word<TAB>count<TAB>idf`` line per word.

    The idf column is ``log(number_of_documents / count)`` using the natural
    logarithm, formatted exactly as ``str()`` would format the float.

    Args:
        word_counts: Mapping from each word (str) to the count recorded for it.
        number_of_documents: Numerator of the idf ratio.
        outfile: Writable text stream that receives the lines.

    Raises:
        ZeroDivisionError: If a recorded count is zero.
        ValueError: If the ratio passed to ``log`` is not positive.
    """
    # items() avoids a second dict lookup per word; writelines() batches the
    # small writes instead of issuing one write() call per word.
    outfile.writelines(
        f'{word}\t{count}\t{log(number_of_documents / count)}\n'
        for word, count in word_counts.items()
    )


def main():
    """Parse the command line, count the training words and write the idf table."""
    # setup argument parser
    parser = argparse.ArgumentParser(prog='training-summarizer', description=DESC)
    parser.add_argument('TRAINING_DIRECTORY', type=str, help=TRAINING_DIRECTORY_DESC)
    parser.add_argument('OUTPUT_FILE', type=str, help=OUTPUT_FILE_DESC)
    args = parser.parse_args()

    # compute word counts and the number of documents for the training set
    # NOTE(review): word_counter lives in the project's utils module; whether
    # the counts are document frequencies or raw term counts is not visible
    # here - confirm, since idf is normally based on document frequency.
    word_counts, number_of_documents = word_counter(args.TRAINING_DIRECTORY)

    # output training data
    # An explicit encoding keeps the output identical across platforms and
    # avoids UnicodeEncodeError for non-ASCII words under a non-UTF-8 locale.
    with open(args.OUTPUT_FILE, 'w', encoding='utf-8') as outfile:
        write_idf_table(word_counts, number_of_documents, outfile)


if __name__ == '__main__':
    main()