added old data science folder back
mxsjoberg committed Oct 28, 2023
1 parent ea324c1 commit 92cc5cb
Showing 15 changed files with 8,655 additions and 0 deletions.
31 changes: 31 additions & 0 deletions Python/_Data-Science/_apache-hive.py
@@ -0,0 +1,31 @@
import sys
import datetime
import pprint
print(sys.version)
# 2.7.13 (default, Sep 30 2017, 13:16:00)

# initialise database
#
# hive > CREATE DATABASE log_db;
# hive > USE log_db;
# hive > CREATE TABLE logs (user String, time String, query String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
# hive > LOAD DATA LOCAL INPATH 'hive/query_logs.txt' OVERWRITE INTO TABLE logs;
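#
# the same setup can also be run non-interactively, e.g. with beeline
# (a sketch; the JDBC URL is an assumption matching the port used below):
#
# $ beeline -u jdbc:hive2://localhost:10000 -e "CREATE DATABASE IF NOT EXISTS log_db"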

# pyhive
from pyhive import hive

# cursor = hive.connect('localhost').cursor()
cursor = hive.connect('localhost', port=10000, auth='KERBEROS', kerberos_service_name='hive').cursor()
cursor.execute('SHOW TABLES')
print(cursor.fetchall())


# cursor.execute('SELECT * FROM logs LIMIT 10')
# print(cursor.fetchall())
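# PyHive's DB-API cursor also accepts pyformat-style parameters; a sketch,
# assuming the logs table is populated ('alice' is a hypothetical user):
#
# cursor.execute('SELECT * FROM logs WHERE user = %(user)s LIMIT 10', {'user': 'alice'})
# print(cursor.fetchall())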

'''
Michael Sjoeberg
2020-05-13
https://github.com/michaelsjoeberg/python-playground/blob/master/data-science/apache-hive.py
'''
95 changes: 95 additions & 0 deletions Python/_Data-Science/_apache-spark.py
@@ -0,0 +1,95 @@
import sys
import datetime
import pprint
print(sys.version)
# 2.7.13 (default, Sep 30 2017, 13:16:00)

# pyspark
from pyspark import SparkContext
sc = SparkContext('local', 'pyspark')
print(sc.version)
# 2.1.2

# NOTE: create output file for large outputs
# $ python apache-spark.py > apache-spark.out

# creating RDD
fs = sc.textFile("spark/u.user")
# print(fs.count())
# 943

# show all
# print(fs.collect())

# create sample (takeSample is random; recorded outputs in this file come from different runs)
sample = fs.takeSample(True, 5)
# print(sample)
# [u'540|28|M|engineer|91201', u'930|28|F|scientist|07310', u'736|48|F|writer|94618', u'172|55|M|marketing|22207', u'486|39|M|educator|93101']

fs_sample = sc.parallelize(sample)
# print(fs_sample.count())
# 5

# save to file
# fs_sample.saveAsTextFile("spark/fs_sample")

# apply functions to data
# -----------------------------------------------------------
def print_all(x): print(x)
fs_sample.foreach(print_all)
# 702|37|M|other|89104
# 621|17|M|student|60402
# 62|27|F|administrator|97214
# 698|28|F|programmer|06906
# 521|19|M|student|02146

def print_id(x): print(x.split('|')[0])
fs_sample.foreach(print_id)
# 506
# 10
# 889
# 798
# 561

# using parallelize to create samples before applying functions
# (takeOrdered sorts the raw text lines lexicographically, hence ids 100-104)
sc.parallelize(fs.takeOrdered(5)).foreach(print_id)
# 100
# 101
# 102
# 103
# 104

# using lambda to filter
fs_student = fs.filter(lambda x: "student" in x)
# print(fs_student.count())
# 196
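
# the same pipeline style supports aggregation, e.g. counting users per
# occupation with a key-value map and reduceByKey (a sketch; in the u.user
# layout, field index 3 is occupation):
# per_occupation = fs.map(lambda x: (x.split('|')[3], 1)).reduceByKey(lambda a, b: a + b)
# print(per_occupation.takeOrdered(5, key=lambda kv: -kv[1]))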

# transformations
# -----------------------------------------------------------
fs_numbers = sc.parallelize([1, 2, 3, 4, 5])
# print(fs_numbers.collect())
# [1, 2, 3, 4, 5]

fs_powers = fs_numbers.map(lambda x: x*x*x)
# print(fs_powers.collect())
# [1, 8, 27, 64, 125]

fs_combined = fs_numbers.filter(lambda x: x >= 4).map(lambda x: x*x*x)
# print(fs_combined.collect())
# [64, 125]

fs_words = sc.parallelize(["this is first", "this is second"]).flatMap(lambda x: x.split(" "))
# print(fs_words.collect())
# ['this', 'is', 'first', 'this', 'is', 'second']
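
# the flatMap output feeds the canonical word count: pair each word with 1,
# then sum by key (a sketch; result ordering may vary):
# word_counts = fs_words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
# print(word_counts.collect())
# [('this', 2), ('is', 2), ('first', 1), ('second', 1)]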

# union
print(fs_numbers.union(sc.parallelize([3, 4, 5, 6, 7])).distinct().collect())
# [2, 4, 6, 1, 3, 5, 7]

# intersection
print(fs_numbers.intersection(sc.parallelize([3, 4, 5, 6, 7])).distinct().collect())
# [4, 3, 5]

# cartesian
print(fs_numbers.cartesian(sc.parallelize([1, 2])).collect())
# [(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (5, 2)]
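
# keyed RDDs also support relational-style joins on the first tuple element
# (a sketch with made-up pairs):
# left = sc.parallelize([(1, 'a'), (2, 'b')])
# right = sc.parallelize([(1, 'x'), (3, 'y')])
# print(left.join(right).collect())
# [(1, ('a', 'x'))]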
30 changes: 30 additions & 0 deletions Python/_Data-Science/_bloomfilter.py
@@ -0,0 +1,30 @@
# https://bitbucket.org/xmonader/pybloomfilter/src/default/
import sys
import datetime
import pprint
print(sys.version)
# 2.7.13 (default, Sep 30 2017, 13:16:00)

# pybloomfilter
from pybloomfilter import BloomFilter

bf = BloomFilter(6000, 0.01)
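# capacity of 6000 elements at a 1% target false-positive rate
# (argument order assumed as BloomFilter(capacity, error_rate))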

# hold out the first 514 entries; add the rest to the filter
first_words = []

with open("bloomfilter/spam_websites.txt") as file:
    for word in file:
        if len(first_words) < 514:
            first_words.append(word.strip())
        else:
            bf.add(str.encode(word.strip()))

# the held-out words were never added, so any hit is a false positive
for word in first_words:
    if str.encode(word) in bf:
        print('{} : false positive'.format(word))
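
# the observed rate can be checked against the 1% target (a sketch):
# hits = sum(1 for word in first_words if str.encode(word) in bf)
# print('false positive rate: {:.4f}'.format(float(hits) / len(first_words)))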

'''
Michael Sjoeberg
2020-05-13
https://github.com/michaelsjoeberg/python-playground/blob/master/data-science/bloomfilter.py
'''