|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "markdown", |
5 | | - "metadata": {}, |
| 5 | + "metadata": { |
| 6 | + "deletable": true, |
| 7 | + "editable": true |
| 8 | + }, |
6 | 9 | "source": [ |
7 | 10 | "# Structuring Datasets" |
8 | 11 | ] |
|
281 | 284 | }, |
282 | 285 | { |
283 | 286 | "cell_type": "code", |
284 | | - "execution_count": 9, |
| 287 | + "execution_count": 1, |
285 | 288 | "metadata": { |
286 | | - "collapsed": true |
| 289 | + "collapsed": true, |
| 290 | + "deletable": true, |
| 291 | + "editable": true |
287 | 292 | }, |
288 | 293 | "outputs": [], |
289 | 294 | "source": [ |
|
295 | 300 | }, |
296 | 301 | { |
297 | 302 | "cell_type": "code", |
298 | | - "execution_count": 10, |
| 303 | + "execution_count": 4, |
299 | 304 | "metadata": { |
300 | | - "collapsed": true |
| 305 | + "collapsed": true, |
| 306 | + "deletable": true, |
| 307 | + "editable": true |
301 | 308 | }, |
302 | 309 | "outputs": [], |
303 | 310 | "source": [ |
304 | 311 | "# Import CSV files via pandas\n", |
305 | 312 | "dset = 'movielens-1m'\n", |
306 | 313 | "fdata = os.path.join(dset, 'ratings.dat.gz')\n", |
307 | | - "fitem = os.path.join(dset, 'movies.dat')\n", |
| 314 | + "fitem = os.path.join(dset, 'movies.dat.gz')\n", |
308 | 315 | "\n", |
309 | 316 | "# pass in column names for each CSV\n", |
310 | 317 | "r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n", |
311 | | - "ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')\n", |
| 318 | + "ratings = pd.read_csv(fdata, sep=';', names=r_cols)\n", |
312 | 319 | "\n", |
313 | 320 | "m_cols = ['movie_id', 'title', 'genres']\n", |
314 | 321 | "movies = pd.read_csv(fitem, sep=';', names=m_cols,\n", |
|
317 | 324 | }, |
318 | 325 | { |
319 | 326 | "cell_type": "code", |
320 | | - "execution_count": 11, |
| 327 | + "execution_count": 5, |
321 | 328 | "metadata": { |
322 | | - "collapsed": true |
| 329 | + "collapsed": true, |
| 330 | + "deletable": true, |
| 331 | + "editable": true |
323 | 332 | }, |
324 | 333 | "outputs": [], |
325 | 334 | "source": [ |
|
329 | 338 | }, |
330 | 339 | { |
331 | 340 | "cell_type": "code", |
332 | | - "execution_count": 12, |
| 341 | + "execution_count": 6, |
333 | 342 | "metadata": { |
334 | | - "collapsed": false |
| 343 | + "collapsed": false, |
| 344 | + "deletable": true, |
| 345 | + "editable": true |
335 | 346 | }, |
336 | 347 | "outputs": [ |
337 | 348 | { |
|
346 | 357 | "dtype: object" |
347 | 358 | ] |
348 | 359 | }, |
349 | | - "execution_count": 12, |
| 360 | + "execution_count": 6, |
350 | 361 | "metadata": {}, |
351 | 362 | "output_type": "execute_result" |
352 | 363 | } |
|
359 | 370 | "cell_type": "code", |
360 | 371 | "execution_count": 13, |
361 | 372 | "metadata": { |
362 | | - "collapsed": true |
| 373 | + "collapsed": true, |
| 374 | + "deletable": true, |
| 375 | + "editable": true |
363 | 376 | }, |
364 | 377 | "outputs": [], |
365 | 378 | "source": [ |
|
393 | 406 | "cell_type": "code", |
394 | 407 | "execution_count": 14, |
395 | 408 | "metadata": { |
396 | | - "collapsed": false |
| 409 | + "collapsed": false, |
| 410 | + "deletable": true, |
| 411 | + "editable": true |
397 | 412 | }, |
398 | 413 | "outputs": [ |
399 | 414 | { |
|
416 | 431 | "cell_type": "code", |
417 | 432 | "execution_count": 15, |
418 | 433 | "metadata": { |
419 | | - "collapsed": false |
| 434 | + "collapsed": false, |
| 435 | + "deletable": true, |
| 436 | + "editable": true |
420 | 437 | }, |
421 | 438 | "outputs": [ |
422 | 439 | { |
|
438 | 455 | "cell_type": "code", |
439 | 456 | "execution_count": 16, |
440 | 457 | "metadata": { |
441 | | - "collapsed": false |
| 458 | + "collapsed": false, |
| 459 | + "deletable": true, |
| 460 | + "editable": true |
442 | 461 | }, |
443 | 462 | "outputs": [ |
444 | 463 | { |
|
477 | 496 | "cell_type": "code", |
478 | 497 | "execution_count": 17, |
479 | 498 | "metadata": { |
480 | | - "collapsed": false |
| 499 | + "collapsed": false, |
| 500 | + "deletable": true, |
| 501 | + "editable": true |
481 | 502 | }, |
482 | 503 | "outputs": [ |
483 | 504 | { |
|
508 | 529 | }, |
509 | 530 | { |
510 | 531 | "cell_type": "markdown", |
511 | | - "metadata": {}, |
| 532 | + "metadata": { |
| 533 | + "deletable": true, |
| 534 | + "editable": true |
| 535 | + }, |
512 | 536 | "source": [ |
513 | 537 | "As can be seen, the size of the denormalized table is much larger than the normalized one (156 MB vs 17 MB). But that is without using compression." |
514 | 538 | ] |
515 | 539 | }, |
516 | 540 | { |
517 | 541 | "cell_type": "markdown", |
518 | | - "metadata": {}, |
| 542 | + "metadata": { |
| 543 | + "deletable": true, |
| 544 | + "editable": true |
| 545 | + }, |
519 | 546 | "source": [ |
520 | 547 | "### Exercise 1\n", |
521 | 548 | "\n", |
|
595 | 622 | }, |
596 | 623 | { |
597 | 624 | "cell_type": "markdown", |
598 | | - "metadata": {}, |
| 625 | + "metadata": { |
| 626 | + "deletable": true, |
| 627 | + "editable": true |
| 628 | + }, |
599 | 629 | "source": [ |
600 | 630 | "### Exercise 2\n", |
601 | 631 | "\n", |
|
689 | 719 | }, |
690 | 720 | { |
691 | 721 | "cell_type": "markdown", |
692 | | - "metadata": {}, |
| 722 | + "metadata": { |
| 723 | + "deletable": true, |
| 724 | + "editable": true |
| 725 | + }, |
693 | 726 | "source": [ |
694 | 727 | "In the next section we will see the effect of querying normalized and denormalized tables." |
695 | 728 | ] |
|
698 | 731 | "cell_type": "code", |
699 | 732 | "execution_count": null, |
700 | 733 | "metadata": { |
701 | | - "collapsed": true |
| 734 | + "collapsed": true, |
| 735 | + "deletable": true, |
| 736 | + "editable": true |
702 | 737 | }, |
703 | 738 | "outputs": [], |
704 | 739 | "source": [] |
|
0 commit comments