# Python_projects_and_more/pandas_overview.py at master · crosstherubicon/Python_projects_and_more · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""Pandas_overview.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1mHyirrg5QqAGk4aQcnUJ7xMOMJk25-z_
"""

import pandas as pd #Remember: pandas is built directly on top of NumPy, so many features are the same.

# Basic DataFrame inspection and filtering on the salaries table.
# This file is a notebook export: a bare expression on its own line is a
# cell whose value the notebook displayed; run as a plain script it is
# evaluated and discarded.

# The path is relative to the current working directory.
df = pd.read_csv('salaries.csv')

df

df['Salary'] #A single column label selects that one column.

df[['Salary','Name']] #Notice two brackets because it's passing a list of column labels.

df['Salary'].max()

df.describe()

df['Salary'] > 6000 #Element-wise comparison: yields one Boolean per row.
# NOTE(review): the comparison above uses 6000 while the filters below use
# 600 — confirm which threshold is intended.

#If I want the entries themselves rather than just the Boolean values,
#index the DataFrame with the Boolean mask.

my_filter = df['Salary'] > 600

# Apply the mask; the original evaluated plain `df` here, which showed the
# unfiltered table and left my_filter unused.
df[my_filter]

df[df['Salary'] > 600] #Same filter written inline without the temporary variable; this is aka masking.

"""## Pandas built-in data visualization"""

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
# %matplotlib inline

# Load the two demo datasets used by the plotting examples below.
# index_col=0 makes the first CSV column the row index of df1.
df1 = pd.read_csv('df1', index_col =0)
df1.head() #notice this is time-series data: look at column 0 (presumably dates — confirm against the file)

# df2 is read with the default integer index.
df2 = pd.read_csv('df2')
df2 #just a normal dataset

#Drawing a histogram of the values in column 'A' of df1

df1['A'].hist() #pandas draws this with matplotlib underneath, so matplotlib keyword arguments can be passed as well

df1['A'].hist(bins=30) #same histogram with 30 bins

import seaborn as sns #makes the histogram prettier, let's rerun
# Since seaborn 0.8 importing the package no longer restyles matplotlib, so
# the theme must be switched on explicitly. sns.set() exists in old and new
# releases (newer ones also provide the equivalent sns.set_theme()).
sns.set()
df1['A'].hist(bins=30)

# Two spellings of the same histogram: the generic plot(kind=...) call and
# the dedicated accessor method.
df1['A'].plot(kind = 'hist', bins = 20)

# bins=20 is passed here too so that this really reproduces the plot above;
# without it the accessor falls back to its default bin count.
df1['A'].plot.hist(bins = 20) #produces same result as above but much cleaner

df2.plot.area(alpha =0.4) #alpha makes the filled areas translucent; without alpha the graph is darker

df2.plot.bar() #it takes the index values as the categories

df2.plot.bar(stacked=True) #stack the columns in one bar per index value instead of drawing them side by side

df1.plot.line()

df1.plot.line(figsize=(12,3), lw=1) #figsize is (width, height) of the figure; lw is the line width

"""# Scatter plots"""

# Scatter plots of column A against column B of df1, progressively adding
# colour and marker-size encodings driven by column C.
df1.plot.scatter(x ='A', y ='B')

df1.plot.scatter(x ='A', y ='B', c ='C') #Not a 3D plot: still a 2D scatter, with column C encoded as the point colour

df1.plot.scatter(x ='A', y ='B', c ='C', cmap = 'coolwarm') #Same colour-coded 2D scatter, drawn with the 'coolwarm' colormap

# s sets the marker size per point, here taken from column C as well.
# NOTE(review): the sizes come straight from the data; if C contains
# negative values they are not meaningful marker sizes — confirm against df1.
df1.plot.scatter(x ='A', y ='B', c ='C', s =df1['C'])

df1.plot.scatter(x ='A', y ='B', c ='C', s =df1['C']*100) #multiply C by 100 to increase the marker size

"""# Box plot"""

df2.plot.box() #box plot of the columns of df2

#Hexagonal bin plot

# NOTE: this rebinds df — the salaries table loaded earlier in the file is
# replaced by a 1000 x 2 frame of np.random.randn samples in columns 'a', 'b'.
# The generator is not seeded, so the data differs on every run.
df = pd.DataFrame(np.random.randn(1000,2), columns = ['a','b'])
df.head()

df.plot.hexbin(x='a', y='b')

df.plot.hexbin(x='a', y='b', gridsize = 25) #a smaller gridsize gives fewer, larger hexagons; hexagons get darker as there are more points inside of them.

df.plot.hexbin(x='a', y='b', gridsize = 25, cmap = 'coolwarm') #same binning, coloured with the 'coolwarm' colormap

"""# Kernel density estimation plots"""

df2['a'].plot.kde() #Kernel density estimate of column 'a' of df2.


df2['a'].plot.density() #density() is documented by pandas as an alias of kde(), so this draws the same kind of curve

df2.plot.kde() #Kernel density estimate computed on the entire DataFrame rather than a single column