-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scala-spark.html
412 lines (367 loc) · 28.3 KB
/
scala-spark.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Scala Spark</title>
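<meta content="Introduction to the Scala programming language: key features, installation, language fundamentals, and common big-data file formats (CSV, Parquet, ORC, Avro)." name="description">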
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<!-- Social profile links. Each anchor is icon-only, so it carries an aria-label as its
     accessible name and the icon glyph is hidden from assistive technology.
     rel="noopener noreferrer" stops the newly opened tab from reaching window.opener. -->
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" rel="noopener noreferrer" class="linkedin" aria-label="LinkedIn profile (opens in a new tab)"><i class="bx bxl-linkedin" aria-hidden="true"></i></a>
<a href="https://github.com/arunp77" target="_blank" rel="noopener noreferrer" class="github" aria-label="GitHub profile (opens in a new tab)"><i class="bx bxl-github" aria-hidden="true"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" rel="noopener noreferrer" class="twitter" aria-label="Twitter profile (opens in a new tab)"><i class="bx bxl-twitter" aria-hidden="true"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" rel="noopener noreferrer" class="instagram" aria-label="Instagram profile (opens in a new tab)"><i class="bx bxl-instagram" aria-hidden="true"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" rel="noopener noreferrer" class="medium" aria-label="Medium profile (opens in a new tab)"><i class="bx bxl-medium" aria-hidden="true"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<!-- <li><a href="index.html#services" class="nav-link scrollto"><i class="bx bx-server"></i> <span>Services</span></a></li>-->
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<li><a href="index.html#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li>
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Data Engineering</h2>
<ol>
<li><a href="Data-engineering.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!-- Right dropdown menu -->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> SQL basics</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquery</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Scala Spark</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<figure>
<img src="assets/img/data-engineering/scala-spark.png" alt="" style="max-width: 70%; max-height: auto;">
<figcaption></figcaption>
</figure>
</div>
<div class="swiper-pagination"></div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Table of Contents</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#objective">Objective</a>
<ul>
<li><a href="#buildsparksession">Building the SparkSession and loading the datasets</a></li>
<li><a href="#select-categorical">Selecting the Categorical variables</a></li>
<li><a href="#pipelines">Pipelines</a></li>
<li><a href="#svmlib">Formatting the database in svmlib format</a></li>
<li><a href="#application">Application of a Spark ML classifier</a></li>
<li><a href="#model">Model prediction</a></li>
<li><a href="#model-evaluate">Model evaluation</a></li>
</ul>
</li>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<h3 id="introduction">Introduction to Scala</h3>
Scala is a versatile programming language that combines object-oriented and functional programming paradigms. It runs on the <a href="https://docs.oracle.com/en/java/javase/21/vm/java-virtual-machine-technology-overview.html" target="_blank">Java Virtual Machine (JVM)</a> and is known for
its concise syntax, strong static typing, and compatibility with Java libraries. Scala is commonly used in various domains including data science, big data processing
(with frameworks like Apache Spark), web development, and more. It offers features like pattern matching, immutability, higher-order functions, and type inference,
making it a powerful tool for building scalable and maintainable applications.
<p>Scala is a modern, multi-paradigm programming language designed to address some of the perceived shortcomings of Java. Here's what makes Scala special:</p>
<ol>
<li><strong>Object-Oriented and Functional Powerhouse: </strong>
<ul>
<li><strong>Everything is an Object: </strong>Like Java, Scala is firmly object-oriented (OO). Classes, inheritance, and methods are familiar constructs.</li>
<li><strong>Functions Rule: </strong>Scala takes it a step further by treating functions as first-class citizens. You can pass functions as arguments and return functions from other functions, which encourages composable code.</li>
</ul>
</li>
<li><strong>Conciseness and Expressivity: </strong>
<ul>
<li><strong>Less Boilerplate: </strong>Scala eliminates a lot of Java's verbosity. Features like type inference and powerful data structures allow you to do more with fewer lines of code.</li>
<li><strong>Pattern Matching: </strong>Elegant way to deconstruct data and make decisions, replacing lengthy if-else chains.</li>
</ul>
</li>
<li><strong>Static Typing and Safety: </strong>
<ul>
<li><strong>Errors Caught Early: </strong>The compiler rigorously checks types, drastically reducing runtime surprises.</li>
<li><strong>Immutability by Default: </strong> Scala encourages side-effect-free coding through immutable variables and data structures, promoting safer and more reliable programs.</li>
</ul>
</li>
<li><strong>Powerful Concurrency Support: </strong>
<ul>
<li><strong>Actor Model: </strong>Scala-based libraries like Akka streamline building concurrent applications using actors (small units of computation that communicate via messages).</li>
<li><strong>Futures and Promises: </strong>Convenient abstractions for handling asynchronous operations. </li>
</ul>
</li>
<li><strong>Rich Ecosystem: </strong>
<ul>
<li><strong>Play Framework: </strong>Popular web framework for building RESTful APIs and dynamic web applications.</li>
<li><strong>Apache Kafka: </strong>Stream processing platform often used in conjunction with Spark.</li>
</ul>
</li>
</ol>
<!------------------------------------->
<h4 id="scala-shines">Where Scala Shines</h4>
<p>Scala is designed with the following goals in mind:</p>
<ul class="standard">
<li>To express common programming patterns using as little code as possible.</li>
<li>To write software that runs at least as fast as Java, but often much faster.</li>
<li>To make it easy to add new features to existing code bases without modifying hundreds of files.</li>
<li>To make it easier to reason about correctness by providing compile-time safety checks.</li>
<li>To make it easier to reason about correctness by providing compile-time safety.</li>
<li>To make it easy to add new features to existing systems without modifying them.</li>
<li>To make it easy to add new features to an existing codebase without modifying hundreds of files.</li>
<li>To make it easy to add new features to an existing codebase without modifying existing code.</li>
<li>To make it easy to add new features to an existing codebase without tearing everything apart.</li>
<li>To make it easy to add new features to existing systems without tearing them apart. This means making it easier for developers to reuse code from other parts of a system.</li>
<li>To make it easy to add new features to an existing codebase without tearing everything apart. This means you can refactor your legacy Java application into a more modular design.</li>
<li>To make it easy to add new features to existing code bases without modifying thousands of lines of code.</li>
<li>To make it easy to add new features to an existing codebase without modifying hundreds of files. This means you can refactor your legacy Java code into a more maintainable structure.</li>
<li>To make it easy to add new features to existing code bases without tearing everything apart. This means that you can refactor your legacy Java code base into a more functional style.</li>
<li>To make it easy to add new features to existing code bases without tearing everything apart. This means that you can gradually refactor your legacy code base into a more functional style.</li>
<li>To have a concise syntax that makes it easy to read and write.</li>
<li>To catch type errors at compile time rather than at runtime.</li>
<li>To provide robust support for functional programming constructs such as higher-order functions.</li>
<li>To support functional programming constructs while still embracing object-oriented design.</li>
<li>To integrate well with existing Java codebases. This means you can use any existing Java library from within your Scala code without having to write new Java wrappers for it.</li>
<li>To be easily embedded into existing Java applications.</li>
</ul>
<!--------------------------------->
<h3 id="getting-started">Getting Started with Scala</h3>
<h5 id="installation">Installation</h5>
<ol>
<li><strong>Visit the Download Page: </strong> Head over to <a href="https://www.scala-lang.org/download/" target="_blank">https://www.scala-lang.org/download/</a> and you'll see different options:
<ul>
<li><strong>IDE (Recommended for beginners): </strong>An IDE includes the Scala compiler, build tools, an editor, debugger, and more. IntelliJ IDEA with the Scala plugin is a popular choice.</li>
<li><strong>Standalone Installer: </strong>Provides just the core Scala compiler and tools. Useful for advanced users or experimenting on the command line.</li>
</ul>
</li>
<li><strong>Follow Instructions: </strong>The download page offers instructions for Windows, macOS, and Linux.</li>
</ol>
<h5 id="important-notes">Important Notes</h5>
<ul>
<li><strong>Java is Required: </strong>Scala code compiles to Java bytecode and runs on the Java Virtual Machine (JVM). Make sure you have an appropriate
<a href="https://docs.oracle.com/en/java/javase/21/install/overview-jdk-installation.html" target="_blank">Java Development Kit (JDK)</a> installed.</li>
<li><strong>IDE Convenience: </strong> While you can install Scala and write code with a simple text editor, an IDE makes things tremendously easier with auto-completion, code navigation, and debugging capabilities.</li>
</ul>
<h5 id="fundamentals">Fundamentals: The Building Blocks</h5>
Here's a summary of essential concepts to grasp right at the beginning:
<ol>
<li><strong>Variables: </strong>
<ul>
<li><code>val</code>: for immutable values (cannot be changed once assigned).</li>
<li><code>var</code>: for mutable values (can be reassigned).</li>
</ul>
</li>
<li><strong>Data Types: </strong>Scala provides the familiar numeric types (<code>Int</code>, <code>Double</code>, etc.), booleans (<code>Boolean</code>), strings (<code>String</code>), and more.</li>
<li><strong>Functions: </strong>Define reusable blocks of code using the <code>def</code> keyword, specifying name, parameters, and return type.</li>
<li><strong>Classes and Objects: </strong>
<ul>
<li>Classes are blueprints for creating objects.</li>
<li>Objects are instances of classes, holding data and behavior.</li>
</ul>
</li>
<li><strong>Traits: </strong>Similar to interfaces in Java, they define abstract methods and fields. A class can extend multiple traits.</li>
<li><strong>Expressions: </strong>Any piece of code that evaluates to a value.</li>
</ol>
</section>
<section>
<h3 id="file-format">File formats: CSV, Parquet, ORC and AVRO</h3>
The selection of a file format profoundly influences the performance and manageability of a data system. As a result, various open-source solutions have emerged to
efficiently store data. Among these, popular storage formats include JSON, Apache Avro, Apache Parquet, Apache ORC, Apache Arrow, and traditional delimited text
files like CSV. Each format entails tradeoffs regarding factors such as flexibility, software compatibility, efficiency, and performance.
<ol>
<li><strong>JSON (JavaScript Object Notation):</strong>
<ul>
<li>JSON is a lightweight and human-readable data interchange format.</li>
<li>It is commonly used for transmitting data between a server and a web application.</li>
<li>JSON is text-based and easy to parse, making it popular for web APIs and configuration files.</li>
<li>However, JSON files can be larger and less efficient compared to binary formats for large datasets.</li>
</ul>
</li>
<li><strong>Apache Avro:</strong>
<ul>
<li>Avro is a binary serialization format developed within the Apache Hadoop project.</li>
<li>It provides a compact, fast, and efficient data serialization framework.</li>
<li>Avro supports schema evolution, allowing for changes to the schema without breaking compatibility with older data.</li>
<li>It is widely used in the Hadoop ecosystem, especially with tools like Apache Kafka and Apache Spark.</li>
</ul>
</li>
<li><strong>Apache Parquet:</strong>
<ul>
<li>Parquet is a columnar storage format optimized for use with big data processing frameworks.</li>
<li>It stores data column-wise rather than row-wise, which improves compression and query performance.</li>
<li>Parquet is efficient for analytics workloads, especially when dealing with large datasets.</li>
<li>It is commonly used with Apache Hadoop, Apache Spark, and other big data tools.</li>
</ul>
</li>
<li><strong>Apache ORC (Optimized Row Columnar):</strong>
<ul>
<li>ORC is another columnar storage format developed within the Apache Hive project.</li>
<li>Similar to Parquet, ORC organizes data by column rather than by row for improved compression and query performance.</li>
<li>ORC supports advanced features like predicate pushdown and stripe-level statistics, enhancing query performance further.</li>
<li>It is often used in data warehousing and analytics applications within the Hadoop ecosystem.</li>
</ul>
</li>
<li><strong>Apache Arrow:</strong>
<ul>
<li>Arrow is a cross-language development platform for in-memory data.</li>
<li>It provides a standardized language-independent columnar memory format for data interchange between different systems.</li>
<li>Arrow enables efficient data sharing and interoperability between various programming languages and analytical tools.</li>
<li>It is particularly beneficial for high-performance analytics and machine learning applications.</li>
</ul>
</li>
<li><strong>CSV (Comma-Separated Values):</strong>
<ul>
<li>CSV is a simple and widely used file format for tabular data.</li>
<li>It stores data in plain text format with each record separated by a delimiter, commonly a comma.</li>
<li>CSV files are human-readable and widely supported by spreadsheet applications and databases.</li>
<li>However, they lack advanced features like schema enforcement and efficient compression compared to more modern formats like Parquet or Avro.</li>
</ul>
</li>
</ol>
</section>
<!-------Reference ------->
<section id="reference">
<h3>References</h3>
<ol>
<li><a href="https://spark.apache.org/documentation.html" target="_blank"> Official Documentation</a></li>
<li><a href="https://www.databricks.com/learn/training/login" target="_blank">Databricks Learning Academy</a></li>
<li><a href="https://sparkbyexamples.com/" target="_blank">Spark by Examples</a></li>
<li><a href="https://www.datacamp.com/tutorial/pyspark-tutorial-getting-started-with-pyspark" target="_blank">Datacamp tutorial</a>.</li>
<li>For Databricks, see the <a href="https://www.youtube.com/watch?v=ChISx0-cMpU" target="_blank">YouTube tutorial videos by Bryan Cafferky</a>,
author of the book "Master Azure Databricks". It is a great playlist for anyone who wants to learn about big data analytics on the Azure Databricks cloud platform.</li>
<li>See the video on <a href="https://www.youtube.com/watch?v=_C8kWso4ne4" target="_blank">PySpark basics by Krish Naik</a>. It is a great video for beginners.</li>
<li><a href="https://www.youtube.com/watch?v=QLGrLFOzMRw" target="_blank">Great YouTube video on Apache Spark</a> working on-premises.</li>
</ol>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my page on <a href="sql-project.html">data, big data, data modeling, data warehousing, SQL, and cloud computing</a>.</li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="Data-engineering.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
// Trigger highlight.js syntax highlighting once the DOM is ready.
// The markup visible on this page loads Prism (assets/js/prism.js) and no highlight.js
// script, so `hljs` may be undefined here; the guard keeps the handler from throwing
// a ReferenceError in that case.
document.addEventListener("DOMContentLoaded", function () {
  if (typeof hljs === "undefined") {
    return;
  }
  if (typeof hljs.highlightAll === "function") {
    // Current highlight.js entry point; replaces the deprecated initHighlightingOnLoad().
    hljs.highlightAll();
  } else if (typeof hljs.initHighlightingOnLoad === "function") {
    // Fallback for older highlight.js builds that predate highlightAll().
    hljs.initHighlightingOnLoad();
  }
});
</script>
</body>
</html>