# FakeNewsDataAnalysis/process_raw_data.py at main · nirdslab/FakeNewsDataAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import os

# Directory, relative to the working directory, that holds the raw ".tsv" recordings
# read by this script.
raw_data_path = r"Data/RawData"
# Directory, relative to the working directory, into which the per-stimulus CSV files
# are written.
processed_eye_movement_data_path = r"Data/ProcessedEyeMovementData"


def read_raw_data_file(file_path):
    """Load a tab-separated raw recording into a DataFrame.

    The path being read is echoed to stdout before the file is parsed.

    Args:
        file_path: Location of the tab-separated file to read.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    print(f"Reading the file at {file_path!s}")
    return pd.read_csv(file_path, sep="\t")


def filter_eye_movement_data(df):
    """Return only the rows whose ``Sensor`` column equals 'Eye Tracker'.

    Args:
        df: DataFrame containing a ``Sensor`` column.

    Returns:
        The subset of ``df`` recorded by the eye tracker, original index kept.
    """
    is_eye_tracker_row = df["Sensor"] == 'Eye Tracker'
    return df[is_eye_tracker_row]


def remove_invalid_data(df):
    """Drop samples that carry no usable eye-tracking measurement.

    A row is removed when any of the following holds:

    * both 'Validity left' and 'Validity right' are 'Invalid' (these columns
      indicate the confidence that the respective eye was correctly
      identified);
    * both Y gaze coordinates (left and right) are missing;
    * both X gaze coordinates (left and right) are missing;
    * both pupil diameters (left and right) are missing.

    Rows where only one eye is affected are kept.

    Args:
        df: DataFrame with the validity, gaze point and pupil diameter columns.

    Returns:
        A new DataFrame without the rejected rows. ``df`` itself is left
        untouched (no in-place drop), so passing a filtered slice is safe.
    """
    both_eyes_invalid = (df['Validity left'] == 'Invalid') & (df['Validity right'] == 'Invalid')
    df = df[~both_eyes_invalid]

    # Each pair is checked separately: a row is dropped only if BOTH members of a pair are NaN.
    # NOTE: `thresh` must not be passed together with `how`; pandas rejects that combination.
    paired_columns = (
        ["Gaze point right Y", "Gaze point left Y"],
        ["Gaze point right X", "Gaze point left X"],
        ["Pupil diameter left", "Pupil diameter right"],
    )
    for pair in paired_columns:
        df = df.dropna(axis=0, how="all", subset=pair)

    return df


def replace_comma(x):
    """Return ``x`` rendered as a string with every ',' swapped for '.'."""
    text = str(x)
    return text.replace(",", ".")


# Since the comma is used as a decimal separator, update it with dot
def update_decimal_seperator(df):
    """Rewrite both pupil diameter columns so they use '.' as decimal separator.

    Every value in 'Pupil diameter left' and 'Pupil diameter right' is passed
    through ``replace_comma``, so the resulting columns hold strings.

    Args:
        df: DataFrame containing the two pupil diameter columns; it is
            updated in place.

    Returns:
        The same ``df`` object with the two columns replaced.
    """
    for column in ("Pupil diameter left", "Pupil diameter right"):
        df[column] = df[column].apply(replace_comma)
    return df


def process_invalid_eye_movement_data(df):
    """Fill missing samples with 0 and clamp gaze coordinates to the screen.

    Missing values in the four gaze point columns and the two pupil diameter
    columns are replaced by 0. Gaze X values are then limited to [0, 1920]
    and gaze Y values to [0, 1080].

    Args:
        df: DataFrame with the gaze point and pupil diameter columns; it is
            updated in place.

    Returns:
        The same ``df`` object after the updates.
    """
    columns_to_fill = (
        'Gaze point left X',
        'Gaze point right X',
        'Gaze point left Y',
        'Gaze point right Y',
        'Pupil diameter left',
        'Pupil diameter right',
    )
    for column in columns_to_fill:
        df[column] = df[column].fillna(0)

    # The limits assume a 1920 x 1080 screen: gaze points outside the screen
    # are moved to the nearest edge.
    gaze_upper_bounds = (
        ('Gaze point left X', 1920),
        ('Gaze point right X', 1920),
        ('Gaze point left Y', 1080),
        ('Gaze point right Y', 1080),
    )
    for column, upper_bound in gaze_upper_bounds:
        df.loc[df[column] > upper_bound, column] = upper_bound
        df.loc[df[column] < 0, column] = 0

    return df


def split_data_by_presented_stimulus(df, file):
    """Write one CSV of eye-movement samples per stimulus.

    For each of 'Stimulus1' .. 'Stimulus60', the rows whose
    'Presented Stimulus name' equals that stimulus are selected, reduced to
    the gaze point, pupil diameter and 'Recording timestamp' columns, and
    written to ``processed_eye_movement_data_path`` as
    ``P<prefix>_<stimulus>.csv``, where ``<prefix>`` is the part of ``file``
    before its first underscore. A file is written for every stimulus, even
    when no rows match.

    Args:
        df: DataFrame holding the processed eye-tracker samples.
        file: Name of the raw input file the samples came from.
    """
    stimuli = [f"Stimulus{number}" for number in range(1, 61)]

    # To get eye movement data for all the presented stimulus instead, use:
    # stimuli = df['Presented Stimulus name'].unique().tolist()

    output_columns = ['Gaze point left X', 'Gaze point right Y', 'Gaze point left Y',
                      'Gaze point right X', 'Pupil diameter left', 'Pupil diameter right',
                      'Recording timestamp']

    # The prefix is the same for every stimulus, so derive it once.
    file_prefix = file.split("_")[0]

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(processed_eye_movement_data_path, exist_ok=True)

    for stimulus in stimuli:
        df_stimulus = df[df['Presented Stimulus name'] == stimulus]
        df_stimulus = df_stimulus[output_columns]
        output_filename = "P" + file_prefix + "_" + str(stimulus) + ".csv"
        print("Writing the file: " + str(output_filename))
        df_stimulus.to_csv(f"{processed_eye_movement_data_path}/{output_filename}")


if __name__ == "__main__":
    # List the raw-data directory directly. Changing into it and back with a
    # hard-coded "../../" only works when raw_data_path is exactly two levels
    # deep, and it alters the working directory of the whole process.
    filenames = os.listdir(raw_data_path)

    for file in filenames:
        # Only the tab-separated exports are processed; everything else is skipped.
        if not file.endswith(".tsv"):
            continue

        file_path = f"{raw_data_path}/{file}"
        df = read_raw_data_file(file_path)
        df = filter_eye_movement_data(df)
        df = remove_invalid_data(df)
        df = process_invalid_eye_movement_data(df)
        df = update_decimal_seperator(df)
        split_data_by_presented_stimulus(df, file)