-
Notifications
You must be signed in to change notification settings - Fork 2
/
default.py
62 lines (43 loc) · 3.35 KB
/
default.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
You have to write the perc_train function that trains the feature weights using the perceptron algorithm for the CoNLL 2000 chunking task.
Each element of train_data is a (labeled_list, feat_list) pair.
Inside the perceptron training loop:
- Call perc_test to get the tagging based on the current feat_vec and compare it with the true output from the labeled_list
- If the output is incorrect then we have to update feat_vec (the weight vector)
- In the notation used in the paper we have w = w_0, w_1, ..., w_n corresponding to \phi_0(x,y), \phi_1(x,y), ..., \phi_n(x,y)
- Instead of indexing each feature with an integer we index each feature using a string we call feature_id
- The feature_id is constructed using the elements of feat_list (which correspond to x above) combined with the output tag (which corresponds to y above)
- The function perc_test shows how the feature_id is constructed for each word in the input, including the bigram feature "B:" which is a special case
- feat_vec[feature_id] is the weight associated with feature_id
- This dictionary lookup lets us implement a sparse vector dot product where any feature_id not used in a particular example does not participate in the dot product
- To save space and time make sure you do not store zero values in the feat_vec dictionary which can happen if \phi(x_i,y_i) - \phi(x_i,y_{perc_test}) results in a zero value
- If you are going word by word to check if the predicted tag is equal to the true tag, there is a corner case where the bigram 'T_{i-1} T_i' is incorrect even though T_i is correct.
"""
import perc
import sys, optparse, os
import neural_model
from collections import defaultdict
def perc_train(train_data, tagset, numepochs):
    """Train perceptron feature weights for the CoNLL 2000 chunking task.

    Args:
        train_data: list of (labeled_list, feat_list) pairs, one per sentence.
        tagset: list of all output tags (the y values in \\phi(x, y)).
        numepochs: maximum number of passes over train_data.

    Returns:
        A sparse weight vector as a defaultdict mapping feature_id (str)
        to its weight; any feature_id absent from the dict has weight 0.
    """
    # Sparse weight vector: missing feature_ids implicitly weigh 0,
    # which keeps the dot product in perc_test cheap.
    weights = defaultdict(int)
    # TODO: implement the perceptron training loop here, limiting
    # training to at most `numepochs` iterations over train_data.
    return weights
if __name__ == '__main__':
    # argparse replaces the long-deprecated optparse module; every flag,
    # dest and default is kept identical so existing invocations still work.
    # Help strings are raw so that \phi is not treated as an escape sequence.
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument("-t", "--tagsetfile", dest="tagsetfile", default=os.path.join("data", "tagset.txt"), help=r"tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    argparser.add_argument("-i", "--trainfile", dest="trainfile", default=os.path.join("data", "train.txt.gz"), help=r"input data, i.e. the x in \phi(x,y)")
    argparser.add_argument("-f", "--featfile", dest="featfile", default=os.path.join("data", "train.feats.gz"), help=r"precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    argparser.add_argument("-e", "--numepochs", dest="numepochs", type=int, default=10, help="number of epochs of training; in each epoch we iterate over all the training examples")
    argparser.add_argument("-m", "--modelfile", dest="modelfile", default=os.path.join("data", "default.model"), help="weights for all features stored on disk")
    opts = argparser.parse_args()

    # Load the tagset (all candidate output labels) and the labeled,
    # feature-annotated training data.
    tagset = perc.read_tagset(opts.tagsetfile)
    print("reading data ...", file=sys.stderr)
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile, verbose=False)
    print("done.", file=sys.stderr)

    # Train the neural tagger and persist its weights to disk.
    # (numepochs is already an int thanks to type=int above.)
    trained_model = neural_model.neural_train(train_data, tagset, opts.numepochs)
    neural_model.dump_model(trained_model, opts.modelfile)