-
Notifications
You must be signed in to change notification settings - Fork 0
/
worker.py
108 lines (88 loc) · 3.95 KB
/
worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Python Modules
import logging
from logging.handlers import RotatingFileHandler
import os
import sys
# Third Party
import wikipedia
# My modules
from thread_messaging import Response
# Directory name used when building the path of each worker's rotating log file
logDir = 'logs'
# Worker receives a word, finds related articles, and then searches each article for all words in wordlist
class Worker:
    """Find Wikipedia articles for queued words and report which words each article contains.

    The worker is driven by the ``wcfg`` mapping handed to the constructor.
    Keys read by this class:

    * ``idx`` -- identifier used to name this worker's logger and log file.
    * ``word_queue`` -- object whose ``get()`` yields the next word item.
    * ``resp_queue`` -- object whose ``put()`` receives ``Response`` objects,
      finished word indexes and the END token.
    * ``wordDataList`` -- iterable of word objects exposing ``word`` and
      ``isPresentIn(text)``.
    * ``overlap`` -- minimum number of distinct words an article must contain
      to be counted.
    * ``articlesPerWord`` -- number of accepted articles wanted per word.
    * ``numWords`` -- total number of words, used only in progress messages.
    """

    def __init__(self, wcfg):
        """Store the configuration and attach stdout and rotating-file log handlers.

        Args:
            wcfg: configuration mapping (see the class docstring for the keys).
        """
        self.cfg = wcfg

        # Set up logger
        logName = 'worker-{0}'.format(self.cfg['idx'])
        logFormat = logging.Formatter("%(asctime)s %(levelname)s: %(threadName)s - %(message)s")
        self.log = logging.getLogger(logName)
        self.log.setLevel(logging.DEBUG)

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(logFormat)
        self.log.addHandler(ch)

        # The file handler cannot create missing directories itself.
        os.makedirs(logDir, exist_ok=True)
        fh = RotatingFileHandler(
            self._logFilePath(logDir, logName),
            maxBytes=5 * 1024 * 1024,
            backupCount=7,
        )
        fh.setFormatter(logFormat)
        self.log.addHandler(fh)

    @staticmethod
    def _logFilePath(directory, logName):
        """Return the path of the ``<logName>.txt`` log file inside ``directory``."""
        # os.path.join picks the right separator for the platform; a literal
        # backslash only works as a separator on Windows.
        return os.path.join(directory, '{0}.txt'.format(logName))

    def parseArticle(self, articleName, articleIndex):
        """Fetch an article summary and report every configured word found in it.

        For each word in ``cfg['wordDataList']`` present in the summary, a
        ``Response(word.word, articleIndex)`` is put on ``cfg['resp_queue']``.

        Args:
            articleName: title of the Wikipedia article to fetch.
            articleIndex: index reported with each match. Per the original
                author's note it is the same as the index of the bit in the
                word files.

        Returns:
            1 if the number of distinct words found is at least
            ``cfg['overlap']``; 0 otherwise, including when the wikipedia
            module raised an error for this article.
        """
        try:
            # Request article summary from Wikipedia
            text = wikipedia.summary(articleName)
        except wikipedia.exceptions.DisambiguationError:
            # Expected to happen often, so it is only logged at debug level
            self.log.debug("{0}: DisambiguationError parsing '{1}'".format(articleIndex, articleName))
            return 0
        except wikipedia.exceptions.WikipediaException as exc:
            # PageError, HTTPTimeoutError and RedirectError all derive from
            # WikipediaException; log the concrete class name.
            self.log.error("{0}: {1} parsing '{2}'".format(articleIndex, type(exc).__name__, articleName))
            return 0

        # For each word in our list - attempt to find it in the given article
        # NOTE(review): matches are queued before the overlap check below, so an
        # article that ends up rejected has still reported words against
        # articleIndex, which the caller then reuses for the next article.
        # Confirm this is intended.
        distinctWordsFound = 0
        for word in self.cfg['wordDataList']:
            if word.isPresentIn(text):
                distinctWordsFound += 1
                self.cfg['resp_queue'].put(Response(word.word, articleIndex))

        self.log.debug("{0}: Found {1} distinct words in '{2}'".format(articleIndex, distinctWordsFound, articleName))

        # Any context (article) should have a minimum number of distinct words else it's useless
        return 1 if distinctWordsFound >= self.cfg['overlap'] else 0

    def run(self):
        """Process words from ``cfg['word_queue']`` until the END token arrives.

        For every word, up to ``cfg['articlesPerWord']`` related articles are
        accepted via ``parseArticle``; afterwards the word's index is put on
        ``cfg['resp_queue']``. On the END token, an END response is put on the
        response queue and the loop exits.
        """
        self.log.info('Starting...')
        articlesPerWord = self.cfg['articlesPerWord']

        # Run until worker grabs END token from word_queue
        while True:
            # Get next word on the queue
            wData = self.cfg['word_queue'].get()

            # If obtained END token then reply with END and quit
            if wData.isEndToken():
                self.log.info('Done.')
                self.cfg['resp_queue'].put(Response().endToken())
                break

            self.log.info("{0}/{1}: Finding articles related to '{2}'".format(wData.index, self.cfg['numWords'], wData.word))

            # Obtain articles related to word (get more than we actually want
            # since many may fail to meet overlap criteria)
            try:
                articleList = wikipedia.search(wData.word, results=articlesPerWord * 3)
            except wikipedia.exceptions.WikipediaException as exc:
                # A failed search must not kill the worker before it has
                # answered for this word and, eventually, posted the END token.
                self.log.error("{0}: {1} searching for '{2}'".format(wData.index, type(exc).__name__, wData.word))
                articleList = []

            # Parse each article until we have parsed as many as 'articlesPerWord' denotes
            articleIndex = wData.index * articlesPerWord
            count = 0
            for articleName in articleList:
                if count >= articlesPerWord:
                    break
                if articleName.endswith('(disambiguation)'):
                    self.log.debug("Skipping article '{0}'".format(articleName))
                    continue
                # parseArticle returns 1 for an accepted article and 0
                # otherwise, so the index only advances on success.
                success = self.parseArticle(articleName, articleIndex)
                count += success
                articleIndex += success

            self.log.debug("Responding with word index {0}".format(wData.index))
            self.cfg['resp_queue'].put(wData.index)