Skip to content

Commit 786f4cc

Browse files
committed
Compress all the datafiles for movielens
1 parent bce07e7 commit 786f4cc

6 files changed

Lines changed: 68 additions & 9954 deletions

File tree

3-Using-Compression.ipynb

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
},
2323
{
2424
"cell_type": "code",
25-
"execution_count": 1,
25+
"execution_count": 2,
2626
"metadata": {
2727
"collapsed": true,
2828
"deletable": true,
@@ -38,7 +38,7 @@
3838
},
3939
{
4040
"cell_type": "code",
41-
"execution_count": 2,
41+
"execution_count": 8,
4242
"metadata": {
4343
"collapsed": false,
4444
"deletable": true,
@@ -49,11 +49,11 @@
4949
"# Import CSV files via pandas\n",
5050
"dset = 'movielens-1m'\n",
5151
"fdata = os.path.join(dset, 'ratings.dat.gz')\n",
52-
"fitem = os.path.join(dset, 'movies.dat')\n",
52+
"fitem = os.path.join(dset, 'movies.dat.gz')\n",
5353
"\n",
5454
"# pass in column names for each CSV\n",
5555
"r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n",
56-
"ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')\n",
56+
"ratings = pd.read_csv(fdata, sep=';', names=r_cols)\n",
5757
"\n",
5858
"m_cols = ['movie_id', 'title', 'genres']\n",
5959
"movies = pd.read_csv(fitem, sep=';', names=m_cols,\n",
@@ -62,7 +62,7 @@
6262
},
6363
{
6464
"cell_type": "code",
65-
"execution_count": 3,
65+
"execution_count": 4,
6666
"metadata": {
6767
"collapsed": false,
6868
"deletable": true,
@@ -78,7 +78,7 @@
7878
"dtype: object"
7979
]
8080
},
81-
"execution_count": 3,
81+
"execution_count": 4,
8282
"metadata": {},
8383
"output_type": "execute_result"
8484
}
@@ -89,7 +89,7 @@
8989
},
9090
{
9191
"cell_type": "code",
92-
"execution_count": 4,
92+
"execution_count": 5,
9393
"metadata": {
9494
"collapsed": false,
9595
"deletable": true,
@@ -106,7 +106,7 @@
106106
"dtype: object"
107107
]
108108
},
109-
"execution_count": 4,
109+
"execution_count": 5,
110110
"metadata": {},
111111
"output_type": "execute_result"
112112
}
@@ -345,7 +345,9 @@
345345
"cell_type": "code",
346346
"execution_count": 9,
347347
"metadata": {
348-
"collapsed": false
348+
"collapsed": false,
349+
"deletable": true,
350+
"editable": true
349351
},
350352
"outputs": [
351353
{

4-Structuring-Datasets.ipynb

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
"cells": [
33
{
44
"cell_type": "markdown",
5-
"metadata": {},
5+
"metadata": {
6+
"deletable": true,
7+
"editable": true
8+
},
69
"source": [
710
"# Structuring Datasets"
811
]
@@ -281,9 +284,11 @@
281284
},
282285
{
283286
"cell_type": "code",
284-
"execution_count": 9,
287+
"execution_count": 1,
285288
"metadata": {
286-
"collapsed": true
289+
"collapsed": true,
290+
"deletable": true,
291+
"editable": true
287292
},
288293
"outputs": [],
289294
"source": [
@@ -295,20 +300,22 @@
295300
},
296301
{
297302
"cell_type": "code",
298-
"execution_count": 10,
303+
"execution_count": 4,
299304
"metadata": {
300-
"collapsed": true
305+
"collapsed": true,
306+
"deletable": true,
307+
"editable": true
301308
},
302309
"outputs": [],
303310
"source": [
304311
"# Import CSV files via pandas\n",
305312
"dset = 'movielens-1m'\n",
306313
"fdata = os.path.join(dset, 'ratings.dat.gz')\n",
307-
"fitem = os.path.join(dset, 'movies.dat')\n",
314+
"fitem = os.path.join(dset, 'movies.dat.gz')\n",
308315
"\n",
309316
"# pass in column names for each CSV\n",
310317
"r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n",
311-
"ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')\n",
318+
"ratings = pd.read_csv(fdata, sep=';', names=r_cols)\n",
312319
"\n",
313320
"m_cols = ['movie_id', 'title', 'genres']\n",
314321
"movies = pd.read_csv(fitem, sep=';', names=m_cols,\n",
@@ -317,9 +324,11 @@
317324
},
318325
{
319326
"cell_type": "code",
320-
"execution_count": 11,
327+
"execution_count": 5,
321328
"metadata": {
322-
"collapsed": true
329+
"collapsed": true,
330+
"deletable": true,
331+
"editable": true
323332
},
324333
"outputs": [],
325334
"source": [
@@ -329,9 +338,11 @@
329338
},
330339
{
331340
"cell_type": "code",
332-
"execution_count": 12,
341+
"execution_count": 6,
333342
"metadata": {
334-
"collapsed": false
343+
"collapsed": false,
344+
"deletable": true,
345+
"editable": true
335346
},
336347
"outputs": [
337348
{
@@ -346,7 +357,7 @@
346357
"dtype: object"
347358
]
348359
},
349-
"execution_count": 12,
360+
"execution_count": 6,
350361
"metadata": {},
351362
"output_type": "execute_result"
352363
}
@@ -359,7 +370,9 @@
359370
"cell_type": "code",
360371
"execution_count": 13,
361372
"metadata": {
362-
"collapsed": true
373+
"collapsed": true,
374+
"deletable": true,
375+
"editable": true
363376
},
364377
"outputs": [],
365378
"source": [
@@ -393,7 +406,9 @@
393406
"cell_type": "code",
394407
"execution_count": 14,
395408
"metadata": {
396-
"collapsed": false
409+
"collapsed": false,
410+
"deletable": true,
411+
"editable": true
397412
},
398413
"outputs": [
399414
{
@@ -416,7 +431,9 @@
416431
"cell_type": "code",
417432
"execution_count": 15,
418433
"metadata": {
419-
"collapsed": false
434+
"collapsed": false,
435+
"deletable": true,
436+
"editable": true
420437
},
421438
"outputs": [
422439
{
@@ -438,7 +455,9 @@
438455
"cell_type": "code",
439456
"execution_count": 16,
440457
"metadata": {
441-
"collapsed": false
458+
"collapsed": false,
459+
"deletable": true,
460+
"editable": true
442461
},
443462
"outputs": [
444463
{
@@ -477,7 +496,9 @@
477496
"cell_type": "code",
478497
"execution_count": 17,
479498
"metadata": {
480-
"collapsed": false
499+
"collapsed": false,
500+
"deletable": true,
501+
"editable": true
481502
},
482503
"outputs": [
483504
{
@@ -508,14 +529,20 @@
508529
},
509530
{
510531
"cell_type": "markdown",
511-
"metadata": {},
532+
"metadata": {
533+
"deletable": true,
534+
"editable": true
535+
},
512536
"source": [
513537
"As can be seen, the size of the denormalized table is much larger than the normalized one (156 MB vs 17 MB). But that is without using compression."
514538
]
515539
},
516540
{
517541
"cell_type": "markdown",
518-
"metadata": {},
542+
"metadata": {
543+
"deletable": true,
544+
"editable": true
545+
},
519546
"source": [
520547
"### Exercise 1\n",
521548
"\n",
@@ -595,7 +622,10 @@
595622
},
596623
{
597624
"cell_type": "markdown",
598-
"metadata": {},
625+
"metadata": {
626+
"deletable": true,
627+
"editable": true
628+
},
599629
"source": [
600630
"### Exercise 2\n",
601631
"\n",
@@ -689,7 +719,10 @@
689719
},
690720
{
691721
"cell_type": "markdown",
692-
"metadata": {},
722+
"metadata": {
723+
"deletable": true,
724+
"editable": true
725+
},
693726
"source": [
694727
"In the next section we will see the effect of querying normalized and denormalized tables."
695728
]
@@ -698,7 +731,9 @@
698731
"cell_type": "code",
699732
"execution_count": null,
700733
"metadata": {
701-
"collapsed": true
734+
"collapsed": true,
735+
"deletable": true,
736+
"editable": true
702737
},
703738
"outputs": [],
704739
"source": []

0 commit comments

Comments
 (0)