Skip to content

Commit 2449e0b

Browse files
committed
Fixed the issue with the colon in the HDF5 file name on Windows (the file is now named blosc-zstd-6.h5 instead of using 'blosc:zstd')
1 parent 8ba15d9 commit 2449e0b

1 file changed

Lines changed: 282 additions & 16 deletions

File tree

6-Integration-With-Pandas.ipynb

Lines changed: 282 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": null,
38+
"execution_count": 27,
3939
"metadata": {
4040
"collapsed": false,
4141
"deletable": true,
@@ -49,7 +49,7 @@
4949
},
5050
{
5151
"cell_type": "code",
52-
"execution_count": null,
52+
"execution_count": 28,
5353
"metadata": {
5454
"collapsed": false,
5555
"deletable": true,
@@ -82,13 +82,29 @@
8282
},
8383
{
8484
"cell_type": "code",
85-
"execution_count": null,
85+
"execution_count": 29,
8686
"metadata": {
8787
"collapsed": false,
8888
"deletable": true,
8989
"editable": true
9090
},
91-
"outputs": [],
91+
"outputs": [
92+
{
93+
"name": "stderr",
94+
"output_type": "stream",
95+
"text": [
96+
"<string>:2: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.\n"
97+
]
98+
},
99+
{
100+
"name": "stdout",
101+
"output_type": "stream",
102+
"text": [
103+
"CPU times: user 15.5 s, sys: 1.03 s, total: 16.5 s\n",
104+
"Wall time: 16.7 s\n"
105+
]
106+
}
107+
],
92108
"source": [
93109
"%%time\n",
94110
"df = pd.read_csv('pokemon/300k_csv.zip')\n",
@@ -101,27 +117,253 @@
101117
},
102118
{
103119
"cell_type": "code",
104-
"execution_count": null,
120+
"execution_count": 30,
105121
"metadata": {
106122
"collapsed": false,
107123
"deletable": true,
108124
"editable": true
109125
},
110-
"outputs": [],
126+
"outputs": [
127+
{
128+
"name": "stdout",
129+
"output_type": "stream",
130+
"text": [
131+
"<class 'pandas.core.frame.DataFrame'>\n",
132+
"RangeIndex: 296021 entries, 0 to 296020\n",
133+
"Columns: 208 entries, pokemonId to target\n",
134+
"dtypes: bool(168), float64(8), int64(17), object(8), uint64(7)\n",
135+
"memory usage: 137.8+ MB\n"
136+
]
137+
}
138+
],
111139
"source": [
112140
"df.info()"
113141
]
114142
},
115143
{
116144
"cell_type": "code",
117-
"execution_count": null,
145+
"execution_count": 31,
118146
"metadata": {
119147
"collapsed": false,
120148
"deletable": true,
121149
"editable": true,
122150
"scrolled": true
123151
},
124-
"outputs": [],
152+
"outputs": [
153+
{
154+
"data": {
155+
"text/html": [
156+
"<div>\n",
157+
"<style>\n",
158+
" .dataframe thead tr:only-child th {\n",
159+
" text-align: right;\n",
160+
" }\n",
161+
"\n",
162+
" .dataframe thead th {\n",
163+
" text-align: left;\n",
164+
" }\n",
165+
"\n",
166+
" .dataframe tbody tr th {\n",
167+
" vertical-align: top;\n",
168+
" }\n",
169+
"</style>\n",
170+
"<table border=\"1\" class=\"dataframe\">\n",
171+
" <thead>\n",
172+
" <tr style=\"text-align: right;\">\n",
173+
" <th></th>\n",
174+
" <th>pokemonId</th>\n",
175+
" <th>latitude</th>\n",
176+
" <th>longitude</th>\n",
177+
" <th>appearedLocalTime</th>\n",
178+
" <th>_id</th>\n",
179+
" <th>cellId_90m</th>\n",
180+
" <th>cellId_180m</th>\n",
181+
" <th>cellId_370m</th>\n",
182+
" <th>cellId_730m</th>\n",
183+
" <th>cellId_1460m</th>\n",
184+
" <th>...</th>\n",
185+
" <th>cooc_143</th>\n",
186+
" <th>cooc_144</th>\n",
187+
" <th>cooc_145</th>\n",
188+
" <th>cooc_146</th>\n",
189+
" <th>cooc_147</th>\n",
190+
" <th>cooc_148</th>\n",
191+
" <th>cooc_149</th>\n",
192+
" <th>cooc_150</th>\n",
193+
" <th>cooc_151</th>\n",
194+
" <th>target</th>\n",
195+
" </tr>\n",
196+
" </thead>\n",
197+
" <tbody>\n",
198+
" <tr>\n",
199+
" <th>0</th>\n",
200+
" <td>16</td>\n",
201+
" <td>20.525745</td>\n",
202+
" <td>-97.460829</td>\n",
203+
" <td>2016-09-08T03:57:45</td>\n",
204+
" <td>NTgxMDkzOTk4MTM5MjUwMjIzNw==</td>\n",
205+
" <td>9645139108510564000</td>\n",
206+
" <td>9645139108711890000</td>\n",
207+
" <td>9645139108443455000</td>\n",
208+
" <td>9645139109517197000</td>\n",
209+
" <td>9645139113812165000</td>\n",
210+
" <td>...</td>\n",
211+
" <td>False</td>\n",
212+
" <td>False</td>\n",
213+
" <td>False</td>\n",
214+
" <td>False</td>\n",
215+
" <td>False</td>\n",
216+
" <td>False</td>\n",
217+
" <td>False</td>\n",
218+
" <td>False</td>\n",
219+
" <td>False</td>\n",
220+
" <td>16</td>\n",
221+
" </tr>\n",
222+
" <tr>\n",
223+
" <th>1</th>\n",
224+
" <td>133</td>\n",
225+
" <td>20.523695</td>\n",
226+
" <td>-97.461167</td>\n",
227+
" <td>2016-09-08T03:57:37</td>\n",
228+
" <td>OTQ1NDgzODc1MjM3NDEzMTI2MQ==</td>\n",
229+
" <td>9645139109852742000</td>\n",
230+
" <td>9645139109785633000</td>\n",
231+
" <td>9645139110590940000</td>\n",
232+
" <td>9645139109517197000</td>\n",
233+
" <td>9645139113812165000</td>\n",
234+
" <td>...</td>\n",
235+
" <td>False</td>\n",
236+
" <td>False</td>\n",
237+
" <td>False</td>\n",
238+
" <td>False</td>\n",
239+
" <td>False</td>\n",
240+
" <td>False</td>\n",
241+
" <td>False</td>\n",
242+
" <td>False</td>\n",
243+
" <td>False</td>\n",
244+
" <td>133</td>\n",
245+
" </tr>\n",
246+
" <tr>\n",
247+
" <th>2</th>\n",
248+
" <td>16</td>\n",
249+
" <td>38.903590</td>\n",
250+
" <td>-77.199780</td>\n",
251+
" <td>2016-09-08T03:57:25</td>\n",
252+
" <td>NTQ0OTQ0NDA1Nzg2ODg3OTg2OQ==</td>\n",
253+
" <td>9923201472785285000</td>\n",
254+
" <td>9923201472986612000</td>\n",
255+
" <td>9923201473791918000</td>\n",
256+
" <td>9923201477013144000</td>\n",
257+
" <td>9923201481308110000</td>\n",
258+
" <td>...</td>\n",
259+
" <td>False</td>\n",
260+
" <td>False</td>\n",
261+
" <td>False</td>\n",
262+
" <td>False</td>\n",
263+
" <td>False</td>\n",
264+
" <td>False</td>\n",
265+
" <td>False</td>\n",
266+
" <td>False</td>\n",
267+
" <td>False</td>\n",
268+
" <td>16</td>\n",
269+
" </tr>\n",
270+
" <tr>\n",
271+
" <th>3</th>\n",
272+
" <td>13</td>\n",
273+
" <td>47.665903</td>\n",
274+
" <td>-122.312561</td>\n",
275+
" <td>2016-09-08T03:56:22</td>\n",
276+
" <td>NTU2MTU1NDM4NzA2MDk1MDcxNw==</td>\n",
277+
" <td>6093392705025474600</td>\n",
278+
" <td>6093392705092583400</td>\n",
279+
" <td>6093392705897889800</td>\n",
280+
" <td>6093392702676664300</td>\n",
281+
" <td>6093392715561566200</td>\n",
282+
" <td>...</td>\n",
283+
" <td>False</td>\n",
284+
" <td>False</td>\n",
285+
" <td>False</td>\n",
286+
" <td>False</td>\n",
287+
" <td>False</td>\n",
288+
" <td>False</td>\n",
289+
" <td>False</td>\n",
290+
" <td>False</td>\n",
291+
" <td>False</td>\n",
292+
" <td>13</td>\n",
293+
" </tr>\n",
294+
" <tr>\n",
295+
" <th>4</th>\n",
296+
" <td>133</td>\n",
297+
" <td>47.666454</td>\n",
298+
" <td>-122.311628</td>\n",
299+
" <td>2016-09-08T03:56:08</td>\n",
300+
" <td>MTY2ODg4MTAzMTczMDE0MTUwNTM=</td>\n",
301+
" <td>6093392707709829100</td>\n",
302+
" <td>6093392707776938000</td>\n",
303+
" <td>6093392708045373400</td>\n",
304+
" <td>6093392711266598900</td>\n",
305+
" <td>6093392715561566200</td>\n",
306+
" <td>...</td>\n",
307+
" <td>False</td>\n",
308+
" <td>False</td>\n",
309+
" <td>False</td>\n",
310+
" <td>False</td>\n",
311+
" <td>False</td>\n",
312+
" <td>False</td>\n",
313+
" <td>False</td>\n",
314+
" <td>False</td>\n",
315+
" <td>False</td>\n",
316+
" <td>133</td>\n",
317+
" </tr>\n",
318+
" </tbody>\n",
319+
"</table>\n",
320+
"<p>5 rows × 208 columns</p>\n",
321+
"</div>"
322+
],
323+
"text/plain": [
324+
" pokemonId latitude longitude appearedLocalTime \\\n",
325+
"0 16 20.525745 -97.460829 2016-09-08T03:57:45 \n",
326+
"1 133 20.523695 -97.461167 2016-09-08T03:57:37 \n",
327+
"2 16 38.903590 -77.199780 2016-09-08T03:57:25 \n",
328+
"3 13 47.665903 -122.312561 2016-09-08T03:56:22 \n",
329+
"4 133 47.666454 -122.311628 2016-09-08T03:56:08 \n",
330+
"\n",
331+
" _id cellId_90m cellId_180m \\\n",
332+
"0 NTgxMDkzOTk4MTM5MjUwMjIzNw== 9645139108510564000 9645139108711890000 \n",
333+
"1 OTQ1NDgzODc1MjM3NDEzMTI2MQ== 9645139109852742000 9645139109785633000 \n",
334+
"2 NTQ0OTQ0NDA1Nzg2ODg3OTg2OQ== 9923201472785285000 9923201472986612000 \n",
335+
"3 NTU2MTU1NDM4NzA2MDk1MDcxNw== 6093392705025474600 6093392705092583400 \n",
336+
"4 MTY2ODg4MTAzMTczMDE0MTUwNTM= 6093392707709829100 6093392707776938000 \n",
337+
"\n",
338+
" cellId_370m cellId_730m cellId_1460m ... \\\n",
339+
"0 9645139108443455000 9645139109517197000 9645139113812165000 ... \n",
340+
"1 9645139110590940000 9645139109517197000 9645139113812165000 ... \n",
341+
"2 9923201473791918000 9923201477013144000 9923201481308110000 ... \n",
342+
"3 6093392705897889800 6093392702676664300 6093392715561566200 ... \n",
343+
"4 6093392708045373400 6093392711266598900 6093392715561566200 ... \n",
344+
"\n",
345+
" cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 \\\n",
346+
"0 False False False False False False False \n",
347+
"1 False False False False False False False \n",
348+
"2 False False False False False False False \n",
349+
"3 False False False False False False False \n",
350+
"4 False False False False False False False \n",
351+
"\n",
352+
" cooc_150 cooc_151 target \n",
353+
"0 False False 16 \n",
354+
"1 False False 133 \n",
355+
"2 False False 16 \n",
356+
"3 False False 13 \n",
357+
"4 False False 133 \n",
358+
"\n",
359+
"[5 rows x 208 columns]"
360+
]
361+
},
362+
"execution_count": 31,
363+
"metadata": {},
364+
"output_type": "execute_result"
365+
}
366+
],
125367
"source": [
126368
"df.head()"
127369
]
@@ -148,7 +390,7 @@
148390
},
149391
{
150392
"cell_type": "code",
151-
"execution_count": null,
393+
"execution_count": 32,
152394
"metadata": {
153395
"collapsed": true,
154396
"deletable": true,
@@ -164,34 +406,58 @@
164406
},
165407
{
166408
"cell_type": "code",
167-
"execution_count": null,
409+
"execution_count": 34,
168410
"metadata": {
169411
"collapsed": false,
170412
"deletable": true,
171413
"editable": true,
172414
"scrolled": true
173415
},
174-
"outputs": [],
416+
"outputs": [
417+
{
418+
"name": "stdout",
419+
"output_type": "stream",
420+
"text": [
421+
"CPU times: user 9.55 s, sys: 562 ms, total: 10.1 s\n",
422+
"Wall time: 10.3 s\n"
423+
]
424+
}
425+
],
175426
"source": [
176427
"%%time\n",
177-
"complib = 'blosc:zstd'\n",
428+
"complib, codec = 'blosc', 'zstd'\n",
178429
"complevel = 6\n",
179-
"filename = \"%s/%s-%d.h5\" % (data_dir, complib, complevel)\n",
430+
"filename = \"%s/%s-%s-%d.h5\" % (data_dir, complib, codec, complevel)\n",
180431
"with pd.HDFStore(filename, mode='w') as hdf:\n",
181432
" # We only index the columns needed\n",
182433
" hdf.put(key='pokemons', value=df, data_columns=['target', 'latitude', 'longitude'],\n",
183-
" format='table', complevel=complevel, complib=complib)"
434+
" format='table', complevel=complevel, complib=\"%s:%s\" % (complib, codec))"
184435
]
185436
},
186437
{
187438
"cell_type": "code",
188-
"execution_count": null,
439+
"execution_count": 35,
189440
"metadata": {
190441
"collapsed": false,
191442
"deletable": true,
192443
"editable": true
193444
},
194-
"outputs": [],
445+
"outputs": [
446+
{
447+
"name": "stdout",
448+
"output_type": "stream",
449+
"text": [
450+
"hdfstore:\r\n",
451+
"total 70592\r\n",
452+
"-rw-r--r-- 1 faltet staff 34M May 18 13:27 blosc-zstd-6.h5\r\n",
453+
"\r\n",
454+
"pokemon:\r\n",
455+
"total 77016\r\n",
456+
"-rw-r--r-- 1 faltet staff 38M May 17 12:28 300k_csv.zip\r\n",
457+
"drwxr-xr-x 153 faltet staff 5.1K May 17 12:28 \u001b[34msprites\u001b[m\u001b[m/\r\n"
458+
]
459+
}
460+
],
195461
"source": [
196462
"%ls -lh {data_dir} pokemon"
197463
]

0 commit comments

Comments
 (0)