-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathdataframe_api.clj
More file actions
100 lines (87 loc) · 2.03 KB
/
Copy pathdataframe_api.clj
File metadata and controls
100 lines (87 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
(ns examples.dataframe-api
(:require
[zero-one.geni.core :as g]
[zero-one.geni.test-resources :refer [melbourne-df]]))
;; Dataset returned by `melbourne-df` (from zero-one.geni.test-resources);
;; every example below operates on this value.
(def dataframe (melbourne-df))
;; Count rows per suburb, then display the five suburbs with the highest
;; counts. Sorting uses the `count` column produced by `g/count`.
(let [rows-per-suburb (g/count (g/group-by dataframe :Suburb))
      by-count-desc   (g/order-by rows-per-suburb (g/desc :count))]
  (g/show (g/limit by-count-desc 5)))
;;=>
;; +--------------+-----+
;; |Suburb |count|
;; +--------------+-----+
;; |Reservoir |359 |
;; |Richmond |260 |
;; |Bentleigh East|249 |
;; |Preston |239 |
;; |Brunswick |222 |
;; +--------------+-----+
;; Display up to five distinct suburb names matching the pattern "%South%".
(let [matches-south (g/like :Suburb "%South%")
      south-rows    (g/filter dataframe matches-south)
      suburb-names  (g/distinct (g/select south-rows :Suburb))]
  (g/show (g/limit suburb-names 5)))
;;=>
;; +----------------+
;; |Suburb |
;; +----------------+
;; |South Melbourne |
;; |South Kingsville|
;; |Clayton South |
;; |Blackburn South |
;; |Vermont South |
;; +----------------+
;; Same ranking as a plain grouped count, but expressed through `g/agg`
;; so the resulting count column is named `n`.
(let [grouped    (g/group-by dataframe :Suburb)
      per-suburb (g/agg grouped {:n (g/count "*")})
      ranked     (g/order-by per-suburb (g/desc :n))]
  (g/show (g/limit ranked 5)))
;;=>
;; +--------------+---+
;; |Suburb |n |
;; +--------------+---+
;; |Reservoir |359|
;; |Richmond |260|
;; |Bentleigh East|249|
;; |Preston |239|
;; |Brunswick |222|
;; +--------------+---+
;; Print the schema of the three selected columns only.
(g/print-schema
  (g/select dataframe :Suburb :Rooms :Price))
;;=>
;; root
;; |-- Suburb: string (nullable = true)
;; |-- Rooms: long (nullable = true)
;; |-- Price: double (nullable = true)
;; Display the summary table that `g/describe` produces for the Price column.
(g/show
  (g/describe dataframe :Price))
;;=>
;; +-------+-----------------+
;; |summary|Price |
;; +-------+-----------------+
;; |count |13580 |
;; |mean |1075684.079455081|
;; |stddev |639310.7242960163|
;; |min |85000.0 |
;; |max |9000000.0 |
;; +-------+-----------------+
;; For each listed column, compute the mean of its null indicator cast to
;; double (i.e. the fraction of null values), aliased back to the column
;; name, and collect the single aggregated row.
(let [null-rate (fn [col-name]
                  ;; null? -> double -> mean, then alias to the input name
                  (g/as (g/mean (g/double (g/null? col-name)))
                        col-name))
      rate-cols (map null-rate ["Car" "LandSize" "BuildingArea"])]
  (g/collect (g/agg dataframe rate-cols)))
;;=>
#_({:Car 0.004565537555228277,
:LandSize 0.0,
:BuildingArea 0.47496318114874814})