-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbig_data_script.py
More file actions
65 lines (48 loc) · 1.82 KB
/
big_data_script.py
File metadata and controls
65 lines (48 loc) · 1.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Install the third-party packages this script depends on.
# ``python -m pip`` is run through ``sys.executable`` so the packages land in
# the interpreter that is executing this code, and the file stays valid Python
# (``!pip ...`` is IPython-only shell syntax).
import subprocess
import sys

for _package in ("pyspark", "findspark", "pandas"):
    # check=False: like the shell magic, a failed install does not abort the
    # run here; the imports below fail loudly if a package is really missing.
    subprocess.run([sys.executable, "-m", "pip", "install", _package], check=False)

# Keep this statement order: findspark.init() is called before pyspark is
# imported, exactly as in the original script.
import findspark
findspark.init()
import pyspark
print(pyspark.__version__)
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Creating a spark context class.
# getOrCreate() reuses an already-running context instead of raising when this
# section is executed a second time in the same process (e.g. a notebook re-run).
sc = SparkContext.getOrCreate()
# Creating a spark session
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    # NOTE(review): "spark.some.config.option" looks like a placeholder
    # key/value rather than a real Spark setting - confirm it is intentional.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Bare expression: displays the session summary as a notebook cell result;
# it has no effect when run as a plain script.
spark
# Load csv into a Spark dataframe
import urllib.request

SEARCHTERMS_URL = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv"
)
# Download with urlretrieve so the local file is overwritten on every run.
# The previous ``curl ... >> searchterms.csv`` appended, so running it twice
# duplicated every row and inserted a second header line into the data.
urllib.request.urlretrieve(SEARCHTERMS_URL, "searchterms.csv")
# The first line of the file is treated as the header; fields are comma-separated.
sdf = spark.read.option("header", "True").option("delimiter", ",").csv("searchterms.csv")
# Report the size of the dataframe: row count first, then column count.
rows, cols = sdf.count(), len(sdf.columns)
print(f"DataFrame Rows count : {rows}")
print(f"DataFrame Columns count : {cols}")
# Preview the first five rows.
sdf.show(n=5)
# Find out the datatype of the column searchterm.
# sdf.dtypes is a list of (column name, type string) pairs; list them all first.
for column_name, column_type in sdf.dtypes:
    print(f"{column_name} , {column_type}")
# Look the column up by name rather than by a hard-coded position, and print
# the result so it is visible when this runs outside a notebook cell.
searchterm_type = dict(sdf.dtypes)["searchterm"]
print(searchterm_type)
# How many times was the term gaming laptop searched.
# The count is bound to a name and printed; as a bare expression the value was
# computed and then discarded whenever it was not the last line of a cell.
gaming_laptop_count = sdf.filter(sdf["searchterm"] == 'gaming laptop').count()
print(gaming_laptop_count)
# Print the top 5 most frequently used search terms
sdf.groupBy('searchterm').count().orderBy('count', ascending=False).show(5)
# Load the sales forecast model
import tarfile
import urllib.request

from pyspark.ml import PipelineModel

# The previous download command fetched searchterms.csv again (appending to the
# dataset) and never produced the model.tar.gz that is opened below.
# NOTE(review): this URL assumes the archive is published next to
# searchterms.csv in the same bucket path - confirm before relying on it.
MODEL_URL = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz"
)
urllib.request.urlretrieve(MODEL_URL, "model.tar.gz")

# Extract the archive; the context manager closes it even if extraction fails.
with tarfile.open("model.tar.gz") as archive:
    archive.extractall("./model")

# NOTE(review): this expects ./model/ itself to be a saved PipelineModel. If
# the archive unpacks into a sub-directory, or holds a different model type,
# the path or loader class needs adjusting - verify against the archive.
loaded_model = PipelineModel.load("./model/")
# Predict sales for 2023 with the loaded forecast model.
# Keep only the rows whose "year" column equals the string '2023'.
# NOTE(review): whether the search-term data has any 2023 rows, and whether the
# model accepts these columns as input, is not visible here - confirm.
testing_data = sdf.where(sdf.year == '2023')
# Run the model over the selected rows.
predictions = loaded_model.transform(testing_data)