# imdb_scraping/imdb_scraping.py at master · Aamir-Raza/imdb_scraping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python

""" Scraping IMDb Credits

Retrieve listing information for the given IMDb crew link
in the form of a dictionary with the title, role and release year
of each entry.

Example usage: python imdb_scraping.py nm6553571
"""

__author__ = "Aamir Raza"

import sys
import re
from collections import defaultdict
from pprint import pprint

import requests                                # pylint: disable=import-error
from bs4 import BeautifulSoup, SoupStrainer    # pylint: disable=import-error

def process_page(page_id, timeout=10):
    """
    Retrieve the filmography credits listed on an IMDb name page.

    Requests https://www.imdb.com/name/<page_id>/, narrows the document to
    the div with id "filmography" and collects one record per credit row.
    The URL, the HTTP status code and the collected data are printed to
    stdout as a side effect.

    page_id: IMDb name id used to build the URL, e.g. 'nm6553571'
    timeout: seconds passed to requests.get as its timeout

    returns defaultdict keyed by title id, each value being the list
    [Name, Year, Info, Category]. The result is empty when the response
    status code is not 200. Rows whose id holds no 'tt…' token are stored
    under the key "none"; fields missing from a row are empty strings.

    'tt2802850': ['Fargo',
                           '2017',
                           '(TV Series) (digital compositor - 1 episode)',
                           'Visual effects'],
    'tt3065204': ['The Conjuring 2',
                           '2016',
                           '(digital compositor)',
                           'Visual effects']
    """

    user_data = defaultdict(list)

    url = 'https://www.imdb.com/name/{}/'.format(page_id)
    print(url)

    # Without a timeout the call can block forever on an unresponsive server
    page = requests.get(url, timeout=timeout)

    print("\nRequest Status Code: {}".format(page.status_code))

    # Nothing to parse unless the request succeeded
    if page.status_code != 200:
        return user_data

    print("Collecting data...\n")

    # SoupStrainer used to narrow down the page to the filmography section
    filmo_filter = SoupStrainer('div', id="filmography")
    soup = BeautifulSoup(page.text, "html.parser", parse_only=filmo_filter)

    # Patterns are compiled once, outside the per-row loop
    row_class = re.compile(r'^filmo-row (?:odd|even)')
    title_id = re.compile(r'tt[0-9]{6,10}')

    # Go through each credit row and find the title, year and role
    for entry in soup.find_all('div', class_=row_class):

        # Category comes from the link inside the element two siblings
        # before the row's parent; any missing link in that chain
        # surfaces as AttributeError
        try:
            heading = entry.parent.previous_sibling.previous_sibling
            category = heading.find('a').text.strip()
        except AttributeError:
            category = ""

        # Title ID for production; re.search returns None when nothing
        # matches, so test the match object itself before indexing it
        id_match = title_id.search(entry.get('id') or '')
        entry_id = id_match.group(0) if id_match else "none"

        # Title name for production
        title = entry.find('b')
        entry_title = title.text.strip() if title is not None else ""

        # Year of production
        year = entry.find('span', class_="year_column")
        entry_year = year.text.strip() if year is not None else ""

        # Info for entry (sometimes contains role): only the text node
        # directly after the title is used; a tag or no sibling yields ""
        trailing = title.next_sibling if title is not None else None
        info = trailing.strip() if isinstance(trailing, str) else ""
        if info.endswith('('):
            info = info[:-1].strip()

        # Later rows with the same key replace earlier ones
        user_data[entry_id] = [entry_title, entry_year, info, category]

    pprint(user_data)

    return user_data

if __name__ == "__main__":

    # Rough sanity check of the command-line argument: at least eight
    # characters, an "nm" prefix and only digits after it (not foolproof)
    page_arg = sys.argv[1] if len(sys.argv) > 1 else None
    looks_valid = (
        page_arg is not None
        and len(page_arg) >= 8
        and page_arg.startswith("nm")
        and page_arg[2:].isdigit()
    )

    if not looks_valid:
        print("Not a valid IMDb page")
    else:
        process_page(page_arg)