-
Notifications
You must be signed in to change notification settings - Fork 2
/
HW 2 Mitchell Layton R Code.R
556 lines (384 loc) · 26.2 KB
/
HW 2 Mitchell Layton R Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
# Mitchell Layton: 912307956
# STA 141A Homework Assignment #2: Due Tuesday, October 31 by 5:00 pm
# references http://ggplot2.tidyverse.org/reference/index.html
# http://www.sthda.com/english/wiki/ggplot2-axis-scales-and-transformations
# for ggplot, and plot trasformations and edits
# http://erdavenport.github.io/R-ecology-lesson/04-dplyr.html for R tutorials and basics
# Other references are scattered throughout code right before each piece of codes that needs referencing.
library(tidyverse)
library(ggmosaic)
library(plyr)
library(ggthemes)
library(grid)
library(sqldf)
library(reshape)
library(plotly)
library(scales)
library(car)
library(RColorBrewer)
# Note I commented out my variables p1, p2, p3, p4 which are the residual and Q-Q... etc
# plots of the linear models in 7 and 8 b/c it messed up R script when running whole thing.
set.seed(1234)
airfare <- read_csv("E:/Davis/STA 141A/DATA/airfare.zip")
cpi_1996_2017 <- read_csv("E:/Davis/STA 141A/DATA/cpi_1996_2017.csv")
# 1)
table_1a = filter(airfare, table %in% c("1a"))
table_6 = filter(airfare, table %in% c("6"))
#2)
# To find the timespan that the data covers we use the "year" column vector
min_year_1a = min(table_1a$year)
max_year_1a = max(table_1a$year)
min_quarter_1a = min(table_1a$quarter)
max_quarter_1a = max(table_1a$quarter)
min_year_6 = min(table_6$year)
max_year_6 = max(table_6$year)
min_quarter_6 = min(table_6$quarter)
max_quarter_6 = max(table_6$quarter)
min_year = min(cbind(min_year_1a,min_year_6))
max_year = max(cbind(max_year_1a,max_year_6))
min_quarter = min(cbind(min_quarter_1a,min_quarter_6))
max_quarter = max(cbind(max_quarter_1a,max_quarter_6))
cat("The timespan that this data covers is from quarter",min_quarter, min_year,"to quarter", max_quarter, max_year)
# (1996-2017)
# To find any gaps in the "years" or "quarters" column vectors where there are NA's.
NAs_Quarter_1a = table_1a$quarter[is.na(table_1a$quarter)]
NAs_Quarter_6 = table_6$quarter[is.na(table_6$quarter)]
NAs_Year_1a = table_1a$year[is.na(table_1a$year)]
NAs_Year_6 = table_6$year[is.na(table_6$year)]
# To look for any missing NA's or patterns
table(is.na(table_1a$lg_carrier))
table(is.na(table_6$lg_carrier))
table(is.na(table_1a$low_carrier))
table(is.na(table_6$low_carrier))
# Notice that table 1a have the most missing values in both of the carrier variables.
table(is.na(table_1a$low_fare))
table(is.na(table_6$low_fare))
table(is.na(table_1a$lg_fare))
table(is.na(table_6$lg_fare))
table(is.na(table_1a$low_marketshare))
table(is.na(table_6$low_marketshare))
table(is.na(table_1a$lg_marketshare))
table(is.na(table_6$lg_marketshare))
#3)
# Create a table from table 6 of both city 1 and city 2 and find the max value of each to determine the most frequent cities
# This tells us that those cities have the most connections with other cities
by_year1 = split(table_6$city1, table_6$year, drop = T)
by_year1 = table(by_year1$`2017`)
max1 = max(by_year1)
ATL = names(by_year1[which.max(by_year1)])
by_year2 = split(table_6$city2, table_6$year, drop = T)
by_year2 = table(by_year2$`2017`)
max2 = max(by_year2)
DC = names(by_year2[which.max(by_year2)])
cat("\nIn 2017, the cities with the most connections were,",ATL,"and",DC,"with", max1,"and",max2,"connections respectively.")
new_table3 = table(table_6$city1)
new_table4 = table(table_6$city2)
MN = names(new_table3[which.min(new_table3)])
MS = names(new_table4[which.min(new_table4)])
max3 = min(new_table3)
max4 = min(new_table4)
cat("\nIn 2017, the cities with the least connections were,",MN,"and",MS,"with", max3,"and",max4,"connections respectively.")
# Now we split the data by the city and year using the split() functions based on year 2007.
# Then we will store name and the count of each variable
by_year1 = split(table_6$city1, table_6$year, drop = T)
by_year1 = table(by_year1$`2007`)
max1 = max(by_year1)
ATL = names(by_year1[which.max(by_year1)])
by_year2 = split(table_6$city2, table_6$year, drop = T)
by_year2 = table(by_year2$`2007`)
max2 = max(by_year2)
DC = names(by_year2[which.max(by_year2)])
cat("\nIn 2007, the cities with the most connections were,",ATL,"and",DC,"with", max1,"and",max2,"connections respectively.")
by_year3 = split(table_6$city1, table_6$year, drop = T)
by_year3 = table(by_year3$`2007`)
min1 = min(by_year3)
GA = names(by_year3[which.min(by_year3)])
by_year4 = split(table_6$city2, table_6$year, drop = T)
by_year4 = table(by_year4$`2007`)
min2 = min(by_year4)
TX = names(by_year4[which.min(by_year4)])
cat("\nIn 2007, the cities with the least connections were,",GA,"and",TX,"with", min1,"and",min2,"connections respectively.")
# Lastly we need to split the data according to the city based on year 1997
by_year1 = split(table_6$city1, table_6$year, drop = T)
by_year1 = table(by_year1$`1997`)
max1 = max(by_year1)
ATL = names(by_year1[which.max(by_year1)])
by_year2 = split(table_6$city2, table_6$year, drop = T)
by_year2 = table(by_year2$`1997`)
max2 = max(by_year2)
DC = names(by_year2[which.max(by_year2)])
cat("\nIn 1997, the cities with the most connections were,",ATL,"and",DC,"with", max1,"and",max2,"connections respectively.")
by_year3 = split(table_6$city1, table_6$year, drop = T)
by_year3 = table(by_year3$`1997`)
min1 = min(by_year3)
GA = names(by_year3[which.min(by_year3)])
by_year4 = split(table_6$city2, table_6$year, drop = T)
by_year4 = table(by_year4$`1997`)
min2 = min(by_year4)
TX = names(by_year4[which.min(by_year4)])
cat("\nIn 1997, the cities with the least connections were,",GA,"and",TX,"with", min1,"and",min2,"connections respectively.")
# Lastly, we need to find the max value the largest increased connectivity
by_city1 = split(table_6$city1, table_6$year, drop = T)
by_1997_1 = table(by_city1$`1997`)
by_2017_1 = table(by_city1$`2017`)
by_city2 = split(table_6$city2, table_6$year, drop = T)
by_1997_2 = table(by_city2$`1997`)
by_2017_2 = table(by_city2$`2017`)
x_1997 = data.frame(count(by_1997_1))
x_2017 = data.frame(count(by_2017_1))
xx_1997 = data.frame(count(by_1997_2))
xx_2017 = data.frame(count(by_2017_2))
# For city 1
city1 = merge(x_1997,x_2017, by = "x.Var1")
# To delete the unneccesary columns
drops = c("freq.x", "freq.y")
city1 = city1[ , !(names(city1) %in% drops)]
colnames(city1) = c("City/State", " (YEAR) 1997", " (YEAR) 2017")
city1$increased_connections = (city1$` (YEAR) 2017` - city1$` (YEAR) 1997`)
city1$increased_connections = abs(city1$increased_connections)
city1 = city1 %>%
arrange(desc(increased_connections))
biggest_city1 = head(city1, 5)
biggest_city1
# For city2
city2 = merge(xx_1997,xx_2017, by = "x.Var1")
# To delete the unneccesary columns
drops = c("freq.x", "freq.y")
city2 = city2[ , !(names(city2) %in% drops)]
colnames(city2) = c("City/State", " (YEAR) 1997", " (YEAR) 2017")
city2$increased_connections = (city2$` (YEAR) 2017` - city2$` (YEAR) 1997`)
city2$increased_connections = abs(city2$increased_connections)
city2 = city2 %>%
arrange(desc(increased_connections))
biggest_city2 = head(city2, 5)
biggest_city2
#4)
# Aggregate function is best here because it is like tapply but the output is a data frame where we specify our numerical variable first followed by the categorical variables and the FUN function to apply
agg_funct_cities = aggregate(passengers ~ quarter + year, table_6, sum)
agg_funct_airports = aggregate(passengers ~ quarter + year, table_1a, sum)
agg_funct = aggregate(passengers ~ quarter + year, airfare, sum)
# plot function using ggplot and the 2d density geom function. We set the data equal to our agg_function for citites and thenn pass the year on the x axis, and passengers on the y to the aes().
# Then I faceted the data based on the quarters so we can see the yearly trends of each of the quarters of passnegers over time.
plot1 = ggplot(data = agg_funct, aes(year, passengers)) +
geom_density_2d(stat="identity") +
facet_wrap(~ quarter, nrow = 2) +
labs(x = "Year", y = "Approximate Total # of Passengers", title = "Breakdown of Distinct Quarterly Total Passengers Over the Years (Cities & Airports)") +
theme(plot.title = element_text(size = rel(1.65))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.text.x=element_text(angle=25, hjust= .5)) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 17, colour = "red"))
plot1
plot2 = ggplot(data = agg_funct, aes(factor(year), passengers)) +
geom_violin(scale = "count", adjust = 1.25, aes(fill = factor(year)), draw_quantiles = c(0.25, 0.5, 0.75)) +
labs(x = "Year", y = "Approximate Total # of Passengers", title = "Violin Plot of Total Passengers over time (with quantiles)") +
theme_few() + theme(plot.title = element_text(size = rel(1.65))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.text.x=element_text(angle=25, hjust= .5)) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5)) +
theme(legend.title = element_text(size = rel(1.65)))
plot2
#5)
# Learned of the reshape() methodology of this problem from Piazza about question 5 from anonymous
# Prefer SQL statements sometimes because of my experience from last summer internship
temp = sqldf("SELECT year, quarter, fare FROM airfare")
# Remap factor levels for easier merge and join
temp$quarter = mapvalues(temp$quarter, from = c("1", "2", "3", "4"), to = c("Q1", "Q2", "Q3", "Q4"))
# Method: Found CPI for Q1 - Q4 by taking average of 3 month groupings
cpi_1996_2017$Q1 = ((cpi_1996_2017$Jan) + (cpi_1996_2017$Feb) + (cpi_1996_2017$Mar)) / 3
cpi_1996_2017$Q2 = ((cpi_1996_2017$Apr) + (cpi_1996_2017$May) + (cpi_1996_2017$Jun)) / 3
cpi_1996_2017$Q3 = ((cpi_1996_2017$Jul) + (cpi_1996_2017$Aug) + (cpi_1996_2017$Sep)) / 3
cpi_1996_2017$Q4 = ((cpi_1996_2017$Oct) + (cpi_1996_2017$Nov) + (cpi_1996_2017$Dec)) / 3
# To delete certain columns
drops = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Oct", "Nov", "Dec", "HALF1", "HALF2")
cpi_1996_2017 = cpi_1996_2017[ , !(names(cpi_1996_2017) %in% drops)]
cpi_1996_2017 = reshape(cpi_1996_2017, varying = c("Q1","Q2","Q3","Q4"), v.names = "CPI", direction = "long")
colnames(cpi_1996_2017) = c("year","Sep","quarter","CPI","id")
drops = c("id")
cpi_1996_2017 = cpi_1996_2017[ , !(names(cpi_1996_2017) %in% drops)]
CPI_b = sqldf("SELECT Sep FROM cpi_1996_2017 WHERE Year = 2017")
CPI_b = as.double(head(CPI_b, 1))
drops = c("Sep")
cpi_1996_2017 = cpi_1996_2017[ , !(names(cpi_1996_2017) %in% drops)]
# Creating new altered name for airfare so to not affect and get rid of the table 1a observations since I am merging the cpi_1996_2017 data with table_6 data only.
airfare_merged = merge(cpi_1996_2017, table_6, by = c("year", "quarter"), all.y = T)
airfare_merged$real17_fare = (airfare_merged$fare)*(CPI_b/(airfare_merged$CPI))
#6)
# We want real Q1 2017 airfare fares over time
fares_over_time = sqldf("SELECT year, real17_fare FROM airfare_merged ORDER BY year")
box_plot = ggplot(data = airfare_merged, aes(x = factor(airfare_merged$year), y = airfare_merged$real17_fare)) +
geom_boxplot() +
labs(x = "Year", y = "Real Q1 2017 Airfares in dollars ($)", title = "Airfares over time Boxplot") +
theme_few() + theme(plot.title = element_text(size = rel(1.65))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5)) +
theme(legend.title = element_text(size = rel(1.65)))
box_plot
#7)
# references: http://t-redactyl.io/blog/2016/05/creating-plots-in-r-using-ggplot2-part-11-linear-regression-plots.html
# http://www.sthda.com/english/wiki/ggplot2-axis-scales-and-transformations
# Part 1: Find a relationship between fare and distance for year 2015 using an appropriate statistical model or test
# reload airfare to ensure all table values are there
relationship_15 = sqldf("SELECT year, miles, fare, `table` FROM airfare WHERE year = 2015 AND`table` LIKE '%1a%'")
model_q7 = lm(relationship_15$fare ~ relationship_15$miles)
summary(model_q7)
# p1 = plot(model_q7)
reg_plot1 = ggplot(data = relationship_15, aes(x = relationship_15$miles, y = relationship_15$fare)) +
geom_point(shape = 1) + geom_smooth(method = lm) +
labs(title = "(Table 1a) Regression Plot: 2015 Airfares ($) vs. Distance (Miles)") +
scale_x_continuous(name = "Distance (miles)", breaks = round(seq(min(relationship_15$miles) + 2, max(relationship_15$miles) + 2, by = 350),5)) +
# changing axis scales and tick amount referece: https://stackoverflow.com/questions/11335836/increase-number-of-axis-ticks
scale_y_continuous(name = "2015 Airfares ($)", breaks = round(seq(min(relationship_15$fare), max(relationship_15$fare), by = 150),0), labels = dollar) +
theme_few() + theme(plot.title = element_text(size = rel(1.49))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5))
reg_plot1 = reg_plot1 + annotate("rect", xmin = 825, xmax = 1165, ymin = 985, ymax = 1065, fill="white", colour="red") +
annotate("text", x = 1000, y = 1050, label = "R^2 == 0.3115", colour = "blue", parse = T) +
annotate("text", x = 1000, y = 1025, label = "alpha == 177.5", colour = "blue", parse=T) +
annotate("text", x = 1000, y = 1000, label = "beta == 0.0587", colour = "blue", parse=T)
reg_plot1
relationship_6_15 = sqldf("SELECT year, miles, fare, `table` FROM airfare WHERE year = 2015 AND `table` LIKE '%6%'")
model_q7_6 = lm(relationship_6_15$fare ~ relationship_6_15$miles)
summary(model_q7_6)
# p2 = plot(model_q7_6)
reg_plot2 = ggplot(data = relationship_6_15, aes(x = relationship_6_15$miles, y = relationship_6_15$fare)) +
geom_point(shape = 1) + geom_smooth(method = lm) +
labs(title = "(Table 6) Regression Plot: 2015 Airfares ($) vs. Distance (Miles)") +
scale_x_continuous(name = "Distance (miles)", breaks = round(seq(min(relationship_6_15$miles) + 3, max(relationship_6_15$miles) + 3, by = 350),5)) +
# changing axis scales and tick amount referece: https://stackoverflow.com/questions/11335836/increase-number-of-axis-ticks
scale_y_continuous(name = "2015 Airfares ($)", breaks = round(seq(min(relationship_6_15$fare) - 3, max(relationship_6_15$fare) - 3, by = 150),0), labels = dollar) +
theme_few() + theme(plot.title = element_text(size = rel(1.49))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5))
reg_plot2 = reg_plot2 + stat_ellipse(colour = "red") + annotate("rect", xmin = 805, xmax = 1185, ymin = 885, ymax = 965, fill="white", colour="red") +
annotate("text", x = 1000, y = 950, label = "R^2 == 0.2347", colour = "blue", parse = T) +
annotate("text", x = 1000, y = 925, label = "alpha == 210.8", colour = "blue", parse=T) +
annotate("text", x = 1000, y = 900, label = "beta == 0.0568", colour = "blue", parse=T)
reg_plot2
#8)
# Scatterplot matrices reference: https://www.statmethods.net/graphs/scatterplot.html
scatter_data1 = sqldf("SELECT fare, miles, passengers FROM airfare WHERE year = 2015 AND `table` LIKE '%1a%'")
# From library car: gives us a matrix of plots which can compare variables and their histrograms
y = scatter_data1$fare # Outcome variable
x = scatter_data1$miles # Predictor 1
z = scatter_data1$passengers # Predictor 2
# Pearson coefficient will tell us how strong the collinearity is between our two variables to determine if we should intrept the slope of y.
cor(x, z, method = "pearson")
# Model with 2 predictor variables
multi_variate1 = lm(y ~ x + z)
summary(multi_variate1)
# p3 = plot(multi_variate1)
# Model with 2 predictor variables and the interaction term.
multi_variate2 = lm(y ~ x + z + x:z)
summary(multi_variate2)
# p4 = plot(multi_variate2)
scatterplotMatrix(scatter_data1)
# Repeat above operations on table 6
scatter_data2 = sqldf("SELECT fare, miles, passengers FROM airfare WHERE year = 2015 AND `table` LIKE '%6%'")
y1 = scatter_data2$fare # Outcome variable
x1 = scatter_data2$miles # Predictor 1
z1 = scatter_data2$passengers # Predictor 2
cor(x1, z1, method = "pearson")
# Model with 2 predictor variables
multi_variate3 = lm(y1 ~ x1 + z1)
summary(multi_variate3)
# p3 = plot(multi_variate1)
# Model with 2 predictor variables and the interaction term.
multi_variate4 = lm(y1 ~ x1 + z1 + x1:z1)
summary(multi_variate4)
# p4 = plot(multi_variate2)
scatterplotMatrix(scatter_data2)
#9)
# We want 2015 pairs of cities where the carrier with largest market share has fares below the average for that city pair
# Thought Process: Select piars of cities,
# find the carrier(s) with largest mkt share that has fares below calculated avg of 2 citites mean fares
select_pairs = sqldf("SELECT city1, city2, fare, lg_carrier, lg_fare FROM table_6 WHERE year = 2015 ORDER BY lg_carrier")
answer = sqldf("SELECT * FROM select_pairs WHERE lg_fare < fare")
hist(answer$lg_fare)
hist(answer$fare)
# Closely equal variances
v1 = var(answer$lg_fare)
v2 = var(answer$fare)
# 2 sample T-test refernces: https://statistics.berkeley.edu/computing/r-t-tests
t.test(answer$fare, answer$lg_fare, alternative = "two.sided", paired = T)
ggplot(data = answer, aes(lg_fare))+
stat_density(aes(group = answer$lg_carrier, color = answer$lg_carrier), position = "identity", geom = "line")
ggplot(data = answer, aes(fare))+
stat_density(aes(group = answer$lg_carrier, color = answer$lg_carrier), position = "identity", geom = "line")
#10)
q_10_1 = sqldf("SELECT city1 as q10_CA_cities, city2 as Others, fare, miles, year FROM table_1a WHERE city1 LIKE '%Sacramento%' OR city1 LIKE '%San Francisco%' OR city1 LIKE '%Oakland%' OR city1 LIKE '%San Jose%' GROUP BY fare, city1 ORDER BY city1")
q_10_2 = sqldf("SELECT city2 as q10_CA_cities, city1 as Others, fare, miles, year FROM table_1a WHERE city2 LIKE '%Sacramento%' OR city2 LIKE '%San Francisco%' OR city2 LIKE '%Oakland%' OR city2 LIKE '%San Jose%' GROUP BY fare, city2 ORDER BY city2")
q_10_main = rbind(q_10_1, q_10_2)
# NOTE: Both San Jose and Oakland were not found in either city1 or city2 for table_1a
sqldf("SELECT city1, city2 FROM table_1a WHERE city1 LIKE '%Oakland%' OR city2 LIKE '%Oakland%'")
sqldf("SELECT city1, city2 FROM table_1a WHERE city1 LIKE '%San Jose%' OR city2 LIKE '%San Jose%'")
# Fares differeing between airports.
fare_differences = aggregate(fare ~ q10_CA_cities, q_10_main, mean)
# mean of fares shows that Sac is cheaper on average by about $10
distance_differences = aggregate(miles ~ q10_CA_cities, q_10_main, mean)
# mean distances in airports: even though Sac has slightly higher mean, below will show SF has most long distance connections.
long_dist_connect = sqldf("SELECT q10_CA_cities, Others, miles, count(miles) as Num_Connections FROM q_10_main WHERE miles = 2704 ORDER BY miles DESC")
# shows that SF has the most longest distance connections from SF to Boston MA at 2704 miles occuring 755 times.
long_dist_connect_1 = sqldf("SELECT q10_CA_cities, Others, miles, count(miles) as Num_Connections FROM q_10_main WHERE q10_CA_cities LIKE '%Sacramento%' ORDER BY miles DESC")
# shows Sac's distance connections from Sac to NY at 2553 miles with 2341 connections
# Previous (first qustions) analysis but depending upon years now:
fare_differences_1 = aggregate(fare ~ q10_CA_cities + year, q_10_main, mean)
sac = sqldf("SELECT * FROM fare_differences_1 WHERE q10_CA_cities LIKE '%Sacramento%'")
sf = sqldf("SELECT * FROM fare_differences_1 WHERE q10_CA_cities LIKE '%San Fran%'")
# plot of sac data: fare vs year
sac_plot = ggplot(data = sac, aes(x = year, y = fare)) +
geom_point(shape = 1) + geom_smooth() +
labs(title = "Sacramento Plot (table 1a): Fares ($) vs. Time (years)", x = "Year", y = "Airfares ($)") +
theme_few() + theme(plot.title = element_text(size = rel(1.49))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5))
sac_plot
# plot of sf data: fare vs year
sf_plot = ggplot(data = sf, aes(x = year, y = fare)) +
geom_point(shape = 1) + geom_smooth() +
labs(title = "San Francisco Plot (table 1a): Fares ($) vs. Time (years)", x = "Year", y = "Airfares ($)") +
theme_few() + theme(plot.title = element_text(size = rel(1.49))) +
theme(axis.title.y = element_text(size = rel(1.77))) +
theme(axis.text.x = element_text(size = rel(1.75))) +
theme(axis.title.x = element_text(size = rel(1.85))) +
theme(axis.text.y = element_text(size = rel(1.35))) +
theme(axis.ticks.length=unit(0.25,"cm")) +
theme(axis.ticks = element_line(size = rel(2))) +
theme(plot.margin=unit(c(1,1,1.5,1.2),"cm")) +
theme(strip.text.x = element_text(size = 22, colour = "red", angle = 22.5))
sf_plot