|
35 | 35 | }, |
36 | 36 | { |
37 | 37 | "cell_type": "code", |
38 | | - "execution_count": null, |
| 38 | + "execution_count": 27, |
39 | 39 | "metadata": { |
40 | 40 | "collapsed": false, |
41 | 41 | "deletable": true, |
|
49 | 49 | }, |
50 | 50 | { |
51 | 51 | "cell_type": "code", |
52 | | - "execution_count": null, |
| 52 | + "execution_count": 28, |
53 | 53 | "metadata": { |
54 | 54 | "collapsed": false, |
55 | 55 | "deletable": true, |
|
82 | 82 | }, |
83 | 83 | { |
84 | 84 | "cell_type": "code", |
85 | | - "execution_count": null, |
| 85 | + "execution_count": 29, |
86 | 86 | "metadata": { |
87 | 87 | "collapsed": false, |
88 | 88 | "deletable": true, |
89 | 89 | "editable": true |
90 | 90 | }, |
91 | | - "outputs": [], |
| 91 | + "outputs": [ |
| 92 | + { |
| 93 | + "name": "stderr", |
| 94 | + "output_type": "stream", |
| 95 | + "text": [ |
| 96 | + "<string>:2: DtypeWarning: Columns (49) have mixed types. Specify dtype option on import or set low_memory=False.\n" |
| 97 | + ] |
| 98 | + }, |
| 99 | + { |
| 100 | + "name": "stdout", |
| 101 | + "output_type": "stream", |
| 102 | + "text": [ |
| 103 | + "CPU times: user 15.5 s, sys: 1.03 s, total: 16.5 s\n", |
| 104 | + "Wall time: 16.7 s\n" |
| 105 | + ] |
| 106 | + } |
| 107 | + ], |
92 | 108 | "source": [ |
93 | 109 | "%%time\n", |
94 | 110 | "df = pd.read_csv('pokemon/300k_csv.zip')\n", |
|
101 | 117 | }, |
102 | 118 | { |
103 | 119 | "cell_type": "code", |
104 | | - "execution_count": null, |
| 120 | + "execution_count": 30, |
105 | 121 | "metadata": { |
106 | 122 | "collapsed": false, |
107 | 123 | "deletable": true, |
108 | 124 | "editable": true |
109 | 125 | }, |
110 | | - "outputs": [], |
| 126 | + "outputs": [ |
| 127 | + { |
| 128 | + "name": "stdout", |
| 129 | + "output_type": "stream", |
| 130 | + "text": [ |
| 131 | + "<class 'pandas.core.frame.DataFrame'>\n", |
| 132 | + "RangeIndex: 296021 entries, 0 to 296020\n", |
| 133 | + "Columns: 208 entries, pokemonId to target\n", |
| 134 | + "dtypes: bool(168), float64(8), int64(17), object(8), uint64(7)\n", |
| 135 | + "memory usage: 137.8+ MB\n" |
| 136 | + ] |
| 137 | + } |
| 138 | + ], |
111 | 139 | "source": [ |
112 | 140 | "df.info()" |
113 | 141 | ] |
114 | 142 | }, |
115 | 143 | { |
116 | 144 | "cell_type": "code", |
117 | | - "execution_count": null, |
| 145 | + "execution_count": 31, |
118 | 146 | "metadata": { |
119 | 147 | "collapsed": false, |
120 | 148 | "deletable": true, |
121 | 149 | "editable": true, |
122 | 150 | "scrolled": true |
123 | 151 | }, |
124 | | - "outputs": [], |
| 152 | + "outputs": [ |
| 153 | + { |
| 154 | + "data": { |
| 155 | + "text/html": [ |
| 156 | + "<div>\n", |
| 157 | + "<style>\n", |
| 158 | + " .dataframe thead tr:only-child th {\n", |
| 159 | + " text-align: right;\n", |
| 160 | + " }\n", |
| 161 | + "\n", |
| 162 | + " .dataframe thead th {\n", |
| 163 | + " text-align: left;\n", |
| 164 | + " }\n", |
| 165 | + "\n", |
| 166 | + " .dataframe tbody tr th {\n", |
| 167 | + " vertical-align: top;\n", |
| 168 | + " }\n", |
| 169 | + "</style>\n", |
| 170 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 171 | + " <thead>\n", |
| 172 | + " <tr style=\"text-align: right;\">\n", |
| 173 | + " <th></th>\n", |
| 174 | + " <th>pokemonId</th>\n", |
| 175 | + " <th>latitude</th>\n", |
| 176 | + " <th>longitude</th>\n", |
| 177 | + " <th>appearedLocalTime</th>\n", |
| 178 | + " <th>_id</th>\n", |
| 179 | + " <th>cellId_90m</th>\n", |
| 180 | + " <th>cellId_180m</th>\n", |
| 181 | + " <th>cellId_370m</th>\n", |
| 182 | + " <th>cellId_730m</th>\n", |
| 183 | + " <th>cellId_1460m</th>\n", |
| 184 | + " <th>...</th>\n", |
| 185 | + " <th>cooc_143</th>\n", |
| 186 | + " <th>cooc_144</th>\n", |
| 187 | + " <th>cooc_145</th>\n", |
| 188 | + " <th>cooc_146</th>\n", |
| 189 | + " <th>cooc_147</th>\n", |
| 190 | + " <th>cooc_148</th>\n", |
| 191 | + " <th>cooc_149</th>\n", |
| 192 | + " <th>cooc_150</th>\n", |
| 193 | + " <th>cooc_151</th>\n", |
| 194 | + " <th>target</th>\n", |
| 195 | + " </tr>\n", |
| 196 | + " </thead>\n", |
| 197 | + " <tbody>\n", |
| 198 | + " <tr>\n", |
| 199 | + " <th>0</th>\n", |
| 200 | + " <td>16</td>\n", |
| 201 | + " <td>20.525745</td>\n", |
| 202 | + " <td>-97.460829</td>\n", |
| 203 | + " <td>2016-09-08T03:57:45</td>\n", |
| 204 | + " <td>NTgxMDkzOTk4MTM5MjUwMjIzNw==</td>\n", |
| 205 | + " <td>9645139108510564000</td>\n", |
| 206 | + " <td>9645139108711890000</td>\n", |
| 207 | + " <td>9645139108443455000</td>\n", |
| 208 | + " <td>9645139109517197000</td>\n", |
| 209 | + " <td>9645139113812165000</td>\n", |
| 210 | + " <td>...</td>\n", |
| 211 | + " <td>False</td>\n", |
| 212 | + " <td>False</td>\n", |
| 213 | + " <td>False</td>\n", |
| 214 | + " <td>False</td>\n", |
| 215 | + " <td>False</td>\n", |
| 216 | + " <td>False</td>\n", |
| 217 | + " <td>False</td>\n", |
| 218 | + " <td>False</td>\n", |
| 219 | + " <td>False</td>\n", |
| 220 | + " <td>16</td>\n", |
| 221 | + " </tr>\n", |
| 222 | + " <tr>\n", |
| 223 | + " <th>1</th>\n", |
| 224 | + " <td>133</td>\n", |
| 225 | + " <td>20.523695</td>\n", |
| 226 | + " <td>-97.461167</td>\n", |
| 227 | + " <td>2016-09-08T03:57:37</td>\n", |
| 228 | + " <td>OTQ1NDgzODc1MjM3NDEzMTI2MQ==</td>\n", |
| 229 | + " <td>9645139109852742000</td>\n", |
| 230 | + " <td>9645139109785633000</td>\n", |
| 231 | + " <td>9645139110590940000</td>\n", |
| 232 | + " <td>9645139109517197000</td>\n", |
| 233 | + " <td>9645139113812165000</td>\n", |
| 234 | + " <td>...</td>\n", |
| 235 | + " <td>False</td>\n", |
| 236 | + " <td>False</td>\n", |
| 237 | + " <td>False</td>\n", |
| 238 | + " <td>False</td>\n", |
| 239 | + " <td>False</td>\n", |
| 240 | + " <td>False</td>\n", |
| 241 | + " <td>False</td>\n", |
| 242 | + " <td>False</td>\n", |
| 243 | + " <td>False</td>\n", |
| 244 | + " <td>133</td>\n", |
| 245 | + " </tr>\n", |
| 246 | + " <tr>\n", |
| 247 | + " <th>2</th>\n", |
| 248 | + " <td>16</td>\n", |
| 249 | + " <td>38.903590</td>\n", |
| 250 | + " <td>-77.199780</td>\n", |
| 251 | + " <td>2016-09-08T03:57:25</td>\n", |
| 252 | + " <td>NTQ0OTQ0NDA1Nzg2ODg3OTg2OQ==</td>\n", |
| 253 | + " <td>9923201472785285000</td>\n", |
| 254 | + " <td>9923201472986612000</td>\n", |
| 255 | + " <td>9923201473791918000</td>\n", |
| 256 | + " <td>9923201477013144000</td>\n", |
| 257 | + " <td>9923201481308110000</td>\n", |
| 258 | + " <td>...</td>\n", |
| 259 | + " <td>False</td>\n", |
| 260 | + " <td>False</td>\n", |
| 261 | + " <td>False</td>\n", |
| 262 | + " <td>False</td>\n", |
| 263 | + " <td>False</td>\n", |
| 264 | + " <td>False</td>\n", |
| 265 | + " <td>False</td>\n", |
| 266 | + " <td>False</td>\n", |
| 267 | + " <td>False</td>\n", |
| 268 | + " <td>16</td>\n", |
| 269 | + " </tr>\n", |
| 270 | + " <tr>\n", |
| 271 | + " <th>3</th>\n", |
| 272 | + " <td>13</td>\n", |
| 273 | + " <td>47.665903</td>\n", |
| 274 | + " <td>-122.312561</td>\n", |
| 275 | + " <td>2016-09-08T03:56:22</td>\n", |
| 276 | + " <td>NTU2MTU1NDM4NzA2MDk1MDcxNw==</td>\n", |
| 277 | + " <td>6093392705025474600</td>\n", |
| 278 | + " <td>6093392705092583400</td>\n", |
| 279 | + " <td>6093392705897889800</td>\n", |
| 280 | + " <td>6093392702676664300</td>\n", |
| 281 | + " <td>6093392715561566200</td>\n", |
| 282 | + " <td>...</td>\n", |
| 283 | + " <td>False</td>\n", |
| 284 | + " <td>False</td>\n", |
| 285 | + " <td>False</td>\n", |
| 286 | + " <td>False</td>\n", |
| 287 | + " <td>False</td>\n", |
| 288 | + " <td>False</td>\n", |
| 289 | + " <td>False</td>\n", |
| 290 | + " <td>False</td>\n", |
| 291 | + " <td>False</td>\n", |
| 292 | + " <td>13</td>\n", |
| 293 | + " </tr>\n", |
| 294 | + " <tr>\n", |
| 295 | + " <th>4</th>\n", |
| 296 | + " <td>133</td>\n", |
| 297 | + " <td>47.666454</td>\n", |
| 298 | + " <td>-122.311628</td>\n", |
| 299 | + " <td>2016-09-08T03:56:08</td>\n", |
| 300 | + " <td>MTY2ODg4MTAzMTczMDE0MTUwNTM=</td>\n", |
| 301 | + " <td>6093392707709829100</td>\n", |
| 302 | + " <td>6093392707776938000</td>\n", |
| 303 | + " <td>6093392708045373400</td>\n", |
| 304 | + " <td>6093392711266598900</td>\n", |
| 305 | + " <td>6093392715561566200</td>\n", |
| 306 | + " <td>...</td>\n", |
| 307 | + " <td>False</td>\n", |
| 308 | + " <td>False</td>\n", |
| 309 | + " <td>False</td>\n", |
| 310 | + " <td>False</td>\n", |
| 311 | + " <td>False</td>\n", |
| 312 | + " <td>False</td>\n", |
| 313 | + " <td>False</td>\n", |
| 314 | + " <td>False</td>\n", |
| 315 | + " <td>False</td>\n", |
| 316 | + " <td>133</td>\n", |
| 317 | + " </tr>\n", |
| 318 | + " </tbody>\n", |
| 319 | + "</table>\n", |
| 320 | + "<p>5 rows × 208 columns</p>\n", |
| 321 | + "</div>" |
| 322 | + ], |
| 323 | + "text/plain": [ |
| 324 | + " pokemonId latitude longitude appearedLocalTime \\\n", |
| 325 | + "0 16 20.525745 -97.460829 2016-09-08T03:57:45 \n", |
| 326 | + "1 133 20.523695 -97.461167 2016-09-08T03:57:37 \n", |
| 327 | + "2 16 38.903590 -77.199780 2016-09-08T03:57:25 \n", |
| 328 | + "3 13 47.665903 -122.312561 2016-09-08T03:56:22 \n", |
| 329 | + "4 133 47.666454 -122.311628 2016-09-08T03:56:08 \n", |
| 330 | + "\n", |
| 331 | + " _id cellId_90m cellId_180m \\\n", |
| 332 | + "0 NTgxMDkzOTk4MTM5MjUwMjIzNw== 9645139108510564000 9645139108711890000 \n", |
| 333 | + "1 OTQ1NDgzODc1MjM3NDEzMTI2MQ== 9645139109852742000 9645139109785633000 \n", |
| 334 | + "2 NTQ0OTQ0NDA1Nzg2ODg3OTg2OQ== 9923201472785285000 9923201472986612000 \n", |
| 335 | + "3 NTU2MTU1NDM4NzA2MDk1MDcxNw== 6093392705025474600 6093392705092583400 \n", |
| 336 | + "4 MTY2ODg4MTAzMTczMDE0MTUwNTM= 6093392707709829100 6093392707776938000 \n", |
| 337 | + "\n", |
| 338 | + " cellId_370m cellId_730m cellId_1460m ... \\\n", |
| 339 | + "0 9645139108443455000 9645139109517197000 9645139113812165000 ... \n", |
| 340 | + "1 9645139110590940000 9645139109517197000 9645139113812165000 ... \n", |
| 341 | + "2 9923201473791918000 9923201477013144000 9923201481308110000 ... \n", |
| 342 | + "3 6093392705897889800 6093392702676664300 6093392715561566200 ... \n", |
| 343 | + "4 6093392708045373400 6093392711266598900 6093392715561566200 ... \n", |
| 344 | + "\n", |
| 345 | + " cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 \\\n", |
| 346 | + "0 False False False False False False False \n", |
| 347 | + "1 False False False False False False False \n", |
| 348 | + "2 False False False False False False False \n", |
| 349 | + "3 False False False False False False False \n", |
| 350 | + "4 False False False False False False False \n", |
| 351 | + "\n", |
| 352 | + " cooc_150 cooc_151 target \n", |
| 353 | + "0 False False 16 \n", |
| 354 | + "1 False False 133 \n", |
| 355 | + "2 False False 16 \n", |
| 356 | + "3 False False 13 \n", |
| 357 | + "4 False False 133 \n", |
| 358 | + "\n", |
| 359 | + "[5 rows x 208 columns]" |
| 360 | + ] |
| 361 | + }, |
| 362 | + "execution_count": 31, |
| 363 | + "metadata": {}, |
| 364 | + "output_type": "execute_result" |
| 365 | + } |
| 366 | + ], |
125 | 367 | "source": [ |
126 | 368 | "df.head()" |
127 | 369 | ] |
|
148 | 390 | }, |
149 | 391 | { |
150 | 392 | "cell_type": "code", |
151 | | - "execution_count": null, |
| 393 | + "execution_count": 32, |
152 | 394 | "metadata": { |
153 | 395 | "collapsed": true, |
154 | 396 | "deletable": true, |
|
164 | 406 | }, |
165 | 407 | { |
166 | 408 | "cell_type": "code", |
167 | | - "execution_count": null, |
| 409 | + "execution_count": 34, |
168 | 410 | "metadata": { |
169 | 411 | "collapsed": false, |
170 | 412 | "deletable": true, |
171 | 413 | "editable": true, |
172 | 414 | "scrolled": true |
173 | 415 | }, |
174 | | - "outputs": [], |
| 416 | + "outputs": [ |
| 417 | + { |
| 418 | + "name": "stdout", |
| 419 | + "output_type": "stream", |
| 420 | + "text": [ |
| 421 | + "CPU times: user 9.55 s, sys: 562 ms, total: 10.1 s\n", |
| 422 | + "Wall time: 10.3 s\n" |
| 423 | + ] |
| 424 | + } |
| 425 | + ], |
175 | 426 | "source": [ |
176 | 427 | "%%time\n", |
177 | | - "complib = 'blosc:zstd'\n", |
| 428 | + "complib, codec = 'blosc', 'zstd'\n", |
178 | 429 | "complevel = 6\n", |
179 | | - "filename = \"%s/%s-%d.h5\" % (data_dir, complib, complevel)\n", |
| 430 | + "filename = \"%s/%s-%s-%d.h5\" % (data_dir, complib, codec, complevel)\n", |
180 | 431 | "with pd.HDFStore(filename, mode='w') as hdf:\n", |
181 | 432 | " # We only index the columns needed\n", |
182 | 433 | " hdf.put(key='pokemons', value=df, data_columns=['target', 'latitude', 'longitude'],\n", |
183 | | - " format='table', complevel=complevel, complib=complib)" |
| 434 | + " format='table', complevel=complevel, complib=\"%s:%s\" % (complib, codec))" |
184 | 435 | ] |
185 | 436 | }, |
186 | 437 | { |
187 | 438 | "cell_type": "code", |
188 | | - "execution_count": null, |
| 439 | + "execution_count": 35, |
189 | 440 | "metadata": { |
190 | 441 | "collapsed": false, |
191 | 442 | "deletable": true, |
192 | 443 | "editable": true |
193 | 444 | }, |
194 | | - "outputs": [], |
| 445 | + "outputs": [ |
| 446 | + { |
| 447 | + "name": "stdout", |
| 448 | + "output_type": "stream", |
| 449 | + "text": [ |
| 450 | + "hdfstore:\r\n", |
| 451 | + "total 70592\r\n", |
| 452 | + "-rw-r--r-- 1 faltet staff 34M May 18 13:27 blosc-zstd-6.h5\r\n", |
| 453 | + "\r\n", |
| 454 | + "pokemon:\r\n", |
| 455 | + "total 77016\r\n", |
| 456 | + "-rw-r--r-- 1 faltet staff 38M May 17 12:28 300k_csv.zip\r\n", |
| 457 | + "drwxr-xr-x 153 faltet staff 5.1K May 17 12:28 \u001b[34msprites\u001b[m\u001b[m/\r\n" |
| 458 | + ] |
| 459 | + } |
| 460 | + ], |
195 | 461 | "source": [ |
196 | 462 | "%ls -lh {data_dir} pokemon" |
197 | 463 | ] |
|
0 commit comments