This repository has been archived by the owner on Jul 1, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
master_planner.rb
327 lines (276 loc) · 8.68 KB
/
master_planner.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
require 'net/http'
require 'uri'
require 'date'
require 'nokogiri'
require 'json'
require 'iron_mq'
# Scrapes event listings from masterplanneronline.com for one city, parses each
# listing into a plain Hash, and can push those hashes onto an IronMQ queue.
#
# Typical flow (as used by the script at the bottom of this file):
# construct with a city and credentials, call #login, call
# #process_events_list, then pass each resulting Hash to #push_event.
class MasterPlanner
  # strftime pattern used for the date-range fields of the search form.
  DATE_FORMAT = '%m/%d/%Y'.freeze
  # CSS selector for the flat list of date-heading and event divs.
  EVENTS_BODY_SELECTOR = '#contentright > div[class^="evtList"]'.freeze
  # CSS selector describing a date-heading div; only its class name is used.
  EVENTS_DATE_SELECTOR = 'div.evtList_Date'.freeze
  # CSS selector describing an event div (currently not referenced anywhere).
  EVENT_SELECTOR = 'div.evtList_Evt'.freeze

  # Tag names that #select_cause_tags treats as "cause" tags.
  # Strings are kept exactly as in the original list (including the trailing
  # space in the Parkinson's entry) because matching is by exact equality.
  CAUSE_TAGS = [
    "Africa", "Animal Rights/Welfare", "Anti Defamation", "Arts Film",
    "Arts Libraries", "Arts Performing Arts", "Arts Visual Arts",
    "Association", "Child Advocacy", "Civic", "Civil/Human Rights",
    "Cultural", "Economic Development", "Education Adult", "Education K-12",
    "Education University", "Environmental", "Faith Catholic",
    "Faith Christian", "Faith Hindu", "Faith Jewish", "Faith Muslim",
    "Faith Protestant", "Family/Social Services", "Gay Rights",
    "Health Addictions", "Health Aging", "Health AIDS", "Health Alzheimer’s",
    "Health Arthritis", "Health Autism", "Health Autoimmune Disease",
    "Health Blind", "Health Cancer", "Health Clinic",
    "Health Clinics/Health Care", "Health Diabetes",
    "Health Digestive Diseases", "Health Heart Disease", "Health Hospitals",
    "Health Mental Health", "Health Misc.", "Health MS",
    "Health Neurological", "Health Orthopedic", "Health Parkinson’s ",
    "Health Pediatrics", "Health Research", "Medical Research", "Museum",
    "Political Action", "Politics Democratic", "Politics Independent",
    "Politics Republican", "Public Television", "Social Services Counseling",
    "Social Services Food Banks", "Social Services Homeless"
  ].freeze

  # Field matchers applied, in this order, to the text fragments of an event.
  # Each entry maps an output key to a :regex plus optional flags understood
  # by #match_fields:
  #   :unprefixed - keep the matched text as-is (the regex is not stripped)
  #   :array      - split the remaining text on ', '
  #   :boolean    - store true instead of the text
  #   :number     - drop commas and convert to Integer
  FIELD_MATCHERS = {
    :start_time => {
      :regex => /\A\d?[0-2]?:[0-5][0-9] (am|pm)\z/,
      :unprefixed => true
    },
    :attire => {
      :regex => / attire$/
    },
    :invitation_only => {
      :regex => /^Invitation only$/,
      :boolean => true
    },
    :speakers => {
      :regex => /^Speaker\(s\): /,
      :array => true
    },
    :honorees => {
      :regex => /^Honoring /,
      :array => true
    },
    :chairs => {
      :regex => /^Chaired by /,
      :array => true
    },
    :co_chairs => {
      :regex => /^Co-chaired by /,
      :array => true
    },
    :hosts => {
      :regex => /^Hosted by /,
      :array => true
    },
    :ticket_price => {
      :regex => /^Tickets from \$/,
      :number => true
    },
    :table_price => {
      :regex => /^Tables from \$/,
      :number => true
    },
    :contact_name => {
      :regex => /^Contact: /
    },
    :contact_phone => {
      # Match numbers in format (999){whitespace or nbsp}999-9999
      :regex => /^\(\d{3}\)(\s|\u00a0)\d{3}-\d{4}$/,
      :unprefixed => true
    },
    :website => {
      :regex => /^Event web address: /
    },
    :address => {
      :regex => /^Event address: /
    }
  }.freeze

  # @param options [Hash] :credentials is the form data posted on login,
  #   :city is interpolated into the site URLs.
  def initialize(options)
    @credentials = options[:credentials]
    @city = options[:city]
    @session_cookie = ''
    setup_endpoints
  end

  # Builds the login and calendar endpoint descriptions (URL, HTTP method and
  # form data). The calendar search covers today through twelve months ahead.
  def setup_endpoints
    @login_endpoint = {
      :url => "http://masterplanneronline.com/Handlers/Login.ashx?region=#{@city}",
      :method => 'POST',
      :form_data => @credentials
    }
    today = Date.today
    @calendar_endpoint = {
      :url => "http://masterplanneronline.com/#{@city}",
      :method => 'POST',
      :form_data => {
        '__EVENTTARGET' => '',
        '__EVENTARGUMENT' => '',
        'ctl00$ddSearchType' => -1,
        'ctl00$ddDateRange' => 'Custom',
        'ctl00$txtSearch' => 'Gala Fundraiser Benefit',
        'ctl00$txtDateFrom' => today.strftime(DATE_FORMAT),
        'ctl00$txtDateTo' => (today >> 12).strftime(DATE_FORMAT)
      }
    }
  end

  # Posts the credentials to the login endpoint and remembers the first
  # name=value pair of the Set-Cookie header for later requests.
  #
  # A non-200 response is reported on stdout but does not abort. When the
  # response carries no Set-Cookie header the stored cookie becomes '' instead
  # of raising NoMethodError on nil.
  #
  # @return [String] the stored session cookie (possibly empty)
  def login
    uri = URI(@login_endpoint[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    req = Net::HTTP::Post.new(uri.request_uri)
    req.set_form_data(@login_endpoint[:form_data])
    puts ">> Attempting login..."
    res = http.request(req)
    if res.code == "200"
      puts ">> Login successful"
    else
      puts ">> Login error (#{res.code})"
      puts res.body.inspect
    end
    # Read the header straight off the response; #response is obsolete.
    cookie_header = res['set-cookie']
    puts ">> No session cookie returned by login" if cookie_header.nil?
    @session_cookie = cookie_header.to_s.split('; ').first.to_s
  end

  # Posts the search form to the calendar endpoint using the stored session
  # cookie and reports the elapsed whole seconds on stdout.
  #
  # @return [String] the response body
  def fetch_events_body
    uri = URI(@calendar_endpoint[:url])
    http = Net::HTTP.new(uri.host, uri.port)
    req = Net::HTTP::Post.new(uri.request_uri)
    req['Cookie'] = @session_cookie
    req.set_form_data(@calendar_endpoint[:form_data])
    started_at = Time.now
    puts ">> Beginning events request"
    res = http.request(req)
    elapsed = (Time.now - started_at).to_i
    puts ">> Completed events request in #{elapsed}s"
    res.body
  end

  # Returns the date-heading/event divs of the listing page. If a file named
  # 'mp.cache' exists in the working directory its contents are parsed instead
  # of hitting the network (nothing in this class writes that file).
  #
  # @return [Nokogiri::XML::NodeSet]
  def process_events_body
    cache_location = 'mp.cache'
    body = File.exist?(cache_location) ? File.read(cache_location) : fetch_events_body
    Nokogiri::HTML(body).css(EVENTS_BODY_SELECTOR)
  end

  # Walks the flat list of divs and attaches to every div that follows a date
  # heading the Date parsed from that heading's <h2>.
  #
  # @return [Array<Hash>] hashes with :node and :date
  def preprocess_events_list
    nodes = process_events_body.to_a
    date_class = EVENTS_DATE_SELECTOR.split('.').last
    # Record positions once instead of calling index(node) repeatedly.
    date_positions = nodes.each_index.select { |position|
      nodes[position].attr('class').split(' ').first == date_class
    }
    events = []
    date_positions.each_with_index do |position, ordinal|
      # NOTE(review): for the final date heading the exclusive range stops at
      # length - 1, so the very last matched div is never emitted. This is
      # preserved from the original behaviour; confirm whether that trailing
      # div is a non-event element or whether the last event is being dropped.
      stop = date_positions[ordinal + 1] || (nodes.length - 1)
      date = Date.parse(nodes[position].at('h2').text)
      ((position + 1)...stop).each do |event_position|
        events.push({ :node => nodes[event_position], :date => date })
      end
    end
    events
  end

  # Parses every listed event node and stamps it with its date.
  #
  # @return [Array<Hash>] one Hash per event, see #parse_event_node
  def process_events_list
    preprocess_events_list.map do |event|
      processed_event = parse_event_node(event[:node])
      processed_event[:date] = event[:date]
      processed_event
    end
  end

  # Splits a description block into its individual fields: CRLFs are removed,
  # runs of spaces collapsed, the text is split on two consecutive
  # non-breaking spaces, and each piece is stripped of surrounding whitespace
  # and one trailing period. Empty pieces are discarded.
  #
  # @param text [String]
  # @return [Array<String>]
  def parse_mp_block_text(text)
    text.gsub("\r\n", '').squeeze(" ").split(/\u00A0{2}/)
        .map { |field| field.strip.gsub(/\.$/, '') }
        .reject { |field| field.empty? }
  end

  # Turns one event node into a Hash.
  #
  # Always sets :organization, :name and :comments (the fragments nothing
  # recognised). Sets the FIELD_MATCHERS keys that matched exactly once,
  # :city/:venue when a known city fragment is present, and
  # :cause_tags/:auction when a "Tags: " fragment is present.
  #
  # @param node [#at] an event element (a Nokogiri node in production)
  # @return [Hash]
  def parse_event_node(node)
    event = {}
    # Header fields, both read via selectors.
    event[:organization] = node.at('.evtList_Evt_Head h3').text.strip
    event[:name] = node.at('.evtList_Evt_Info .EvtTitle').text.strip

    # Description block: the third child of the info element holds the text.
    mp_description_array = parse_mp_block_text(node.at('.evtList_Evt_Info').children[2].text)
    mp_extra_array = parse_mp_block_text(node.at('.evtList_ExtInfo').text)
    # Array union: fragments appearing in both blocks are kept once.
    event_fields_array = mp_description_array | mp_extra_array

    matches, unmatched = match_fields(event_fields_array, FIELD_MATCHERS)
    event.merge!(matches)

    city = unmatched.grep(/^(New York|New York City|Brooklyn|Bronx)$/).first
    if city
      city_index = unmatched.index(city)
      # The venue is the fragment right before the city. When the city is the
      # first fragment there is no venue; index -1 would wrongly wrap around
      # to the last fragment.
      venue = city_index > 0 ? unmatched[city_index - 1] : nil
      unmatched.delete(city)
      event[:city] = city
      if venue
        unmatched.delete(venue)
        event[:venue] = venue
      end
    end

    tags_line = unmatched.grep(/^Tags: /).first
    if tags_line
      unmatched.delete(tags_line)
      # NOTE(review): tags are split on a single space, yet many CAUSE_TAGS
      # entries contain spaces and so can never match a token here. Confirm
      # the real separator used by the site.
      tags = tags_line.gsub(/^Tags: /, '').split(" ")
      event[:cause_tags] = select_cause_tags(tags)
      event[:auction] = true if tags.include?("Auction")
    end

    event[:comments] = unmatched
    event
  end

  # Applies each matcher to the fragments that are still unclaimed.
  #
  # A matcher that hits exactly one fragment claims it; zero hits are ignored;
  # two or more hits are reported on stdout and left unclaimed. Neither
  # source_array nor its strings are modified.
  #
  # @param source_array [Array<String>] text fragments
  # @param matcher_hash [Hash{Symbol=>Hash}] see FIELD_MATCHERS for the shape
  # @return [Array(Hash, Array<String>)] matched values by property, and the
  #   fragments no matcher claimed
  def match_fields(source_array, matcher_hash)
    matches = {}
    # Work on a copy so the caller's array is left intact.
    unmatched = source_array.dup
    matcher_hash.each do |property, params|
      candidates = unmatched.grep(params[:regex])
      if candidates.length >= 2
        puts "Matched #{candidates.length} times for #{property}"
        puts candidates
        next
      end
      next if candidates.empty?

      raw = candidates.first
      unmatched.delete(raw)
      # Non-destructive gsub: the fragment string itself stays untouched.
      value = params[:unprefixed] ? raw : raw.gsub(params[:regex], '')
      matches[property] =
        if params[:array]
          value.split(', ')
        elsif params[:boolean]
          true
        elsif params[:number]
          value.gsub(',', '').to_i
        else
          value
        end
    end
    [matches, unmatched]
  end

  # @param tag_array [Array<String>]
  # @return [Array<String>] the entries that appear verbatim in CAUSE_TAGS
  def select_cause_tags(tag_array)
    tag_array.select { |tag| CAUSE_TAGS.include?(tag) }
  end

  # Posts the event, serialised as JSON, to the "scraped-events" IronMQ queue.
  # The queue handle is created on first use and reused for later events.
  #
  # @param event [#to_json]
  def push_event(event)
    @queue ||= IronMQ::Client.new.queue("scraped-events")
    @queue.post(event.to_json)
  end
end
# Script entry point: read the login form data from credentials.json, log in
# to the New York site, parse the listed events and queue each one.
credentials = JSON.parse(File.read('credentials.json'))
mp = MasterPlanner.new({ :city => 'newyork', :credentials => credentials })
mp.login
events = mp.process_events_list
events.each { |event| mp.push_event(event) }