-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpandas_overview.py
More file actions
105 lines (56 loc) · 2.47 KB
/
Copy pathpandas_overview.py
File metadata and controls
105 lines (56 loc) · 2.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""Pandas_overview.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1mHyirrg5QqAGk4aQcnUJ7xMOMJk25-z_
"""
import pandas as pd  # Remember: pandas is built directly on NumPy, so many features are the same.

# Load the salary table; the path is resolved relative to the working directory.
df = pd.read_csv('salaries.csv')
df

# Single brackets with one label select that one column.
df['Salary']
# Two pairs of brackets: the inner pair is a Python list of column labels.
df[['Salary', 'Name']]

df['Salary'].max()
df.describe()

# Comparing a column against a scalar yields one Boolean per row.
# NOTE(review): this threshold is 6000 while the filters below use 600 —
# confirm which value is intended.
df['Salary'] > 6000

# To get the matching rows themselves rather than just the Boolean values,
# index the DataFrame with the Boolean result.
my_filter = df['Salary'] > 600
df[my_filter]

# Same selection without the intermediate variable (a.k.a. masking).
df[df['Salary'] > 600]
"""## Pandas built-in data visualization"""
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
# %matplotlib inline

# index_col=0 turns the first CSV column into the row index; per the original
# notebook note, that column is what makes this a time-series dataset.
df1 = pd.read_csv('df1', index_col=0)
df1.head()
df2 = pd.read_csv('df2')  # just a normal dataset, read with the default index
df2

# Histogram of the values in column A of df1. pandas hands the drawing to
# matplotlib, so matplotlib keyword arguments can be passed as well.
df1['A'].hist()
df1['A'].hist(bins=30)

import seaborn as sns
# Since seaborn 0.8, importing the package no longer restyles matplotlib.
# The theme must be applied explicitly, otherwise the rerun below looks
# exactly like the histogram above.
sns.set()
df1['A'].hist(bins=30)

# Two spellings of the same plot type: the generic plot(kind=...) call and the
# cleaner accessor method. The accessor call uses the default bin count, so
# its bars are wider than in the bins=20 call.
df1['A'].plot(kind='hist', bins=20)
df1['A'].plot.hist()

# alpha makes the filled areas translucent; without it the fill is darker.
df2.plot.area(alpha=0.4)

# Bar plots treat each index value as a category.
df2.plot.bar()
df2.plot.bar(stacked=True)

df1.plot.line()
df1.plot.line(figsize=(12, 3), lw=1)  # lw is the line width
"""# Scatter plots"""
# Basic scatter: column A on the x-axis against column B on the y-axis.
df1.plot.scatter(x="A", y="B")

# c="C" colours every point by its value in column C, so a third variable is
# shown on what is still a two-dimensional plot.
df1.plot.scatter(x="A", y="B", c="C")
# Same plot with an explicit colormap.
df1.plot.scatter(x="A", y="B", c="C", cmap="coolwarm")

# s sets the marker size per point, here also taken from column C.
df1.plot.scatter(x="A", y="B", c="C", s=df1["C"])
# Scale column C by 100 so the markers are larger.
df1.plot.scatter(x="A", y="B", c="C", s=df1["C"] * 100)
"""# Box plot"""
df2.plot.box()

# Hexagonal bin plot
# Build a 1000-row, two-column frame of standard-normal samples to plot.
# NOTE: this rebinds df, which previously held the salaries table.
df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"])
df.head()

df.plot.hexbin(x="a", y="b")
# gridsize=25 gives fewer, larger hexagons than the default; each hexagon's
# colour reflects how many points fall inside it.
df.plot.hexbin(x="a", y="b", gridsize=25)
df.plot.hexbin(x="a", y="b", gridsize=25, cmap="coolwarm")
"""# Kernel density estimation plots"""
# Kernel density estimate for column "a" of df2 only.
df2["a"].plot.kde()
# density() is another name for kde(), so this draws the same curve.
df2["a"].plot.density()
# Called on the DataFrame itself: KDE of the entire frame rather than one column.
df2.plot.kde()