Skip to content

Commit cfc9670

Browse files
authored
initial commit (#1)
1 parent f490484 commit cfc9670

12 files changed

Lines changed: 46079 additions & 0 deletions

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
4+
# Environment
5+
venv/
6+
.venv/
7+
8+
# Project settings
9+
.idea/

moviedb_analyzer.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#-#!/usr/bin/env python3
2+
from src.facade import Facade
3+
4+
def main():
    """Run the movie metadata analysis and export the data as JSON.

    Loads the CSV through the Facade, logs each statistic via the
    facade's logger, then saves the loaded data to a JSON file.
    """
    # Where the data comes from and where it is written to.
    source_path = "resources/movies_metadata.csv"
    source_type = 'csv'
    target_path = "resources/movies_metadata.json"
    target_type = 'json'

    facade = Facade(input_type=source_type, input_path=source_path)

    # Each statistic is computed first and logged afterwards.
    unique_movies = facade.count_unique_rows(['imdb_id', 'original_title'])
    facade.logger.info(f"Unique movies: {unique_movies}")

    average_rating = facade.find_average('vote_average')
    facade.logger.info(f"Average ratings of all movies: {average_rating}")

    top_rated = facade.find_top(
        sort_column='vote_average',
        top=5,
        return_column='original_title',
    )
    facade.logger.info(f"Top 5 highest rated movies: {top_rated}")

    per_year = facade.movies_by_year('release_date')
    facade.logger.info(f"Movies released each year: {per_year}")

    per_genre = facade.movies_by_genre('genres')
    facade.logger.info(f"Movies released in each genre: {per_genre}")

    # Export the loaded data as JSON.
    facade.save_as(output_type=target_type, output_path=target_path)


if __name__ == "__main__":
    main()

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pandas==2.2.2

requirements_test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
coverage==7.5.4

resources/movies_metadata.csv

Lines changed: 45573 additions & 0 deletions
Large diffs are not rendered by default.

setup.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from io import open as io_open
2+
from setuptools import setup, find_packages
3+
4+
# Read the module requirements from requirements.txt instead of repeating the
# dependencies here. Blank lines and "#" comment lines are not requirements,
# so they are dropped before being handed to setuptools.
with open("requirements.txt", encoding="utf-8") as f:
    requirements = f.read().splitlines()
install_requires = [
    line.strip()
    for line in requirements
    if line.strip() and not line.strip().startswith("#")
]

# Read the README inside a context manager so the file handle is closed
# deterministically rather than left open until garbage collection.
with io_open("README.md", encoding="utf-8") as readme:
    long_description = readme.read()

setup(
    name="moviedb_analyzer",
    version="0.1",
    description="Pandas analyzer for moviedb files",
    long_description=long_description,
    package_dir={"": "src"},
    packages=find_packages("src"),
    install_requires=install_requires,
)

src/analyzer.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from typing import List
2+
from pandas import DataFrame, to_datetime
3+
4+
from src.logger import Logger
5+
6+
class Analyzer:
    """Compute statistics and aggregations over a pandas DataFrame.

    All methods read from the DataFrame passed to the constructor and
    return new pandas objects; none of them modifies that DataFrame.
    """

    def __init__(self, dataframe: DataFrame):
        """Store the DataFrame to analyze and create the logger.

        Args:
            dataframe (DataFrame): Dataframe to be analyzed
        """
        self._dataframe = dataframe
        self._logger = Logger()

        self._logger.debug(f"{self._dataframe.columns=}")

    def count_unique_rows(self, columns: List[str]) -> int:
        """Count the distinct value combinations of the given columns.

        Args:
            columns (List[str]): Columns whose combined values identify a row

        Returns:
            int: Number of rows left after dropping duplicates on ``columns``
        """
        return self._dataframe[columns].drop_duplicates().shape[0]

    def find_average(self, column: str) -> float:
        """Return the mean of a column.

        Args:
            column (str): Name of the column to average

        Returns:
            float: The result of calling ``mean()`` on the column
        """
        return self._dataframe[column].mean()

    def find_top_rows(self, column: str, top: int) -> DataFrame:
        """Return the first ``top`` rows after sorting descending by ``column``.

        Args:
            column (str): Column to sort by
            top (int): Number of rows to return

        Returns:
            DataFrame: The ``top`` highest rows according to ``column``
        """
        ordered = self._dataframe.sort_values(by=[column], ascending=False)

        return ordered.head(top)

    def movies_by_year(self, column: str) -> DataFrame:
        """Count rows per year.

        The year is taken as the first four characters of each string
        value in ``column``.

        Args:
            column (str): Name of the column holding the date strings

        Returns:
            DataFrame: Row count per year, as produced by
                ``groupby(['year']).size()``.
                NOTE(review): pandas documents that result as a Series
                indexed by year; the annotation is kept unchanged for
                existing callers.
        """
        # assign() returns a new frame. The previous in-place column
        # assignment added a 'year' column to the shared DataFrame, which
        # then leaked into every later analysis and into the saved output.
        tmp_df = self._dataframe.assign(year=self._dataframe[column].str[:4])

        self._logger.debug(f"{tmp_df.columns=}")

        counts = tmp_df.groupby(['year']).size()

        self._logger.debug(f"{type(counts)=}")

        return counts

    def movies_by_genre(self, column: str) -> DataFrame:
        """Count rows per genre.

        Each list-like value in ``column`` is expanded with ``explode`` so
        that a row with several genres is counted once for each of them.

        Args:
            column (str): Name of the column holding the genres

        Returns:
            DataFrame: Row count per genre, as produced by
                ``groupby(column).size()``.
                NOTE(review): pandas documents that result as a Series
                indexed by genre; the annotation is kept unchanged for
                existing callers.
        """
        # explode() returns a new frame, so the shared DataFrame is untouched.
        tmp_df = self._dataframe.explode(column)

        self._logger.debug(f"{tmp_df.head(5)=}")

        counts = tmp_df.groupby(column).size()

        self._logger.debug(f"{type(counts)=}")

        return counts
101+

src/facade.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from typing import List
2+
from pathlib import Path
3+
4+
from src.file_handler import FileHandler
5+
from src.analyzer import Analyzer
6+
from src.logger import Logger
7+
8+
class Facade:
    """Single entry point of the app.

    Wires a FileHandler (loading and saving) to an Analyzer (statistics)
    and exposes their operations, plus a logger, to the caller.
    """

    def __init__(self, input_type: str, input_path: str):
        """Load the input file and prepare the analyzer.

        Args:
            input_type (str): Type of input file
            input_path (str): Path of input file
        """
        self.logger = Logger()

        self._file_handler = FileHandler(input_type=input_type,
                                         input_path=input_path)

        self._dataframe = self._file_handler.dataframe

        self._analyzer = Analyzer(self._dataframe)

    def count_unique_rows(self, columns: List[str]) -> int:
        """Count unique rows based on a column combination.

        Args:
            columns (List[str]): List of columns to be used in aggregation.

        Returns:
            int: Number of unique rows reported by the analyzer
        """
        return self._analyzer.count_unique_rows(columns)

    def find_average(self, column: str) -> float:
        """Find the average of the values in a column.

        Args:
            column (str): Column name

        Returns:
            float: Average reported by the analyzer
        """
        return self._analyzer.find_average(column)

    def find_top(self, sort_column: str, top: int, return_column: str) -> str:
        """Find the top N values of return_column, based on sort_column.

        Args:
            sort_column (str): Column to sort by
            top (int): Number of records to return
            return_column (str): Column to return values for

        Returns:
            str: String representation of the top N values of return_column
        """
        top_rows = self._analyzer.find_top_rows(column=sort_column, top=top)

        # Render to text so the result matches the declared ``str`` return
        # type and the other reporting methods of this class; previously
        # the raw pandas selection was returned.
        return top_rows[return_column].to_string()

    def movies_by_year(self, column: str) -> str:
        """Count movies by year.

        Args:
            column (str): Year column

        Returns:
            str: String representation of the per-year counts
        """
        counts_by_year = self._analyzer.movies_by_year(column)

        return counts_by_year.to_string()

    def movies_by_genre(self, column: str) -> str:
        """Count movies by genre.

        Args:
            column (str): Genre column.

        Returns:
            str: String representation of the per-genre counts
        """
        counts_by_genre = self._analyzer.movies_by_genre(column)

        return counts_by_genre.to_string()

    def save_as(self, output_type: str, output_path: str):
        """Save the loaded data through the file handler.

        Args:
            output_type (str): Type of the file to be saved
            output_path (str): Path of the file to be saved
        """
        self._file_handler.save_df_as_file(output_type=output_type, output_path=output_path)
103+

0 commit comments

Comments
 (0)