Course-Compass/my-app/src/presenters/UploadTranscriptPresenter.jsx at c67402876e9abc8bb6d4a0f3f5097e7937568e2c · InferenceKTH/Course-Compass · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import React from 'react';
import { observer } from "mobx-react-lite";
import * as pdfjsLib from "pdfjs-dist";
import pdfWorker from "pdfjs-dist/build/pdf.worker?url";
import { useState } from "react";
import UploadField from '../views/Components/SideBarComponents/UploadField';

pdfjsLib.GlobalWorkerOptions.workerSrc = pdfWorker;

/**
 * Presenter for the transcript upload field.
 *
 * Lets the user pick a PDF transcript, extracts course codes from it with
 * pdf.js, merges them into the "completedCourses" entry of localStorage and
 * renders UploadField with the current error state.
 *
 * @param {object} props
 * @param {object} props.model - Application model handed in by the parent;
 *     it is not referenced anywhere in this component's body.
 */
const UploadTranscriptPresenter = observer(({ model }) => {
    const [errorMessage, setErrorMessage] = useState(""); // Text of the last scraper error, already prefixed with "Error: "
    const [errorVisibility, setErrorVisibility] = useState("hidden"); // "hidden" or "visible"; passed to UploadField as errorVisibility
    const [fileInputValue, setFileInputValue] = useState(""); // Value passed to UploadField as fileInputValue; only ever set to ''

    /**
     * Reads an uploaded transcript PDF, collects the text items of every page
     * and hands them to evaluatePDFtextObjectArray for course-code extraction.
     *
     * Problems (wrong MIME type, unreadable file, pdf.js or evaluation errors)
     * are reported through throwTranscriptScraperError rather than thrown, so
     * a failed read or parse does not reject the returned promise.
     *
     * @param {File} [file] - File picked in the upload field; when it is
     *     missing the function only logs a note and returns.
     * @returns {Promise<void>}
     */
    async function transcriptScraperFunction(file) {
        if (!file) {
            console.log("element: 'PDF-Scraper-Input' changed, but we havent gotten a file yet.");
            return;
        }
        if (file.type !== "application/pdf") {
            throwTranscriptScraperError("Uploaded file isn't PDF.");
            return;
        }

        // Hide any error left over from a previous upload attempt.
        setErrorVisibility("hidden");

        try {
            // Reading the file happens inside the try block so that a failed
            // read is shown to the user instead of becoming an unhandled rejection.
            const arrayBuffer = await file.arrayBuffer();
            const typedArray = new Uint8Array(arrayBuffer);
            const pdf = await pdfjsLib.getDocument({ data: typedArray }).promise;

            // Text items of the whole document, in page order.
            const textObjects = [];

            // pdf.js pages are numbered starting at 1.
            for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
                const page = await pdf.getPage(pageNum);
                const textContent = await page.getTextContent();
                textObjects.push(...textContent.items);
            }

            evaluatePDFtextObjectArray(textObjects);
        } catch (e) {
            throwTranscriptScraperError(`While parsing the pdf something went wrong. ${e}`);
        }
    }

    /**
     * Reports a transcript-scraper problem. Despite its name it does not
     * throw: it logs the text to the console and publishes it through the
     * component's error state so the error becomes visible.
     *
     * @param {string} txt - Description of the problem, without any prefix.
     */
    function throwTranscriptScraperError(txt) {
        const consoleLine = `PDF-Scraper-Error: ${txt}`;
        const displayedMessage = `Error: ${txt}`;

        console.log(consoleLine);
        setErrorMessage(displayedMessage);
        setErrorVisibility("visible");
    }

    /**
     * Merges newly scraped course codes into the "completedCourses" list kept
     * in localStorage and dispatches a "completedCourses changed" event on
     * window afterwards.
     *
     * Previously stored codes are sorted and kept first, the new codes follow
     * in the order given, and duplicates are dropped. A missing, unparsable or
     * non-array stored value is treated as an empty list so that a damaged
     * entry cannot abort the upload.
     *
     * @param {string[]} codesArr - Course codes to add.
     */
    function writeLocalStorage_completedCourses(codesArr) {
        let stored = [];
        const raw = localStorage.getItem("completedCourses");
        if (raw) {
            try {
                const parsed = JSON.parse(raw);
                if (Array.isArray(parsed)) {
                    stored = parsed;
                }
            } catch (e) {
                // Start over from an empty list; the entry is rewritten below.
                console.log(`Ignoring unreadable completedCourses entry: ${e}`);
            }
        }

        stored.sort();

        // A Set keeps first occurrences in insertion order, which removes duplicates.
        const merged = [...new Set(stored.concat(codesArr))];

        localStorage.setItem("completedCourses", JSON.stringify(merged));
        console.log(merged);

        window.dispatchEvent(new Event("completedCourses changed"));
    }

    /**
     * Extracts course codes from the text items of a transcript PDF and
     * passes them to writeLocalStorage_completedCourses.
     *
     * The scan looks for the KTH heading at the left margin, then treats the
     * first table after it (tables start with "Code"/"Kod") as the one to
     * transcribe. A later "Code"/"Kod" ends the scan unless the item two
     * positions before it sits at the page-break marker position, in which
     * case the table is considered to continue. Inside the table an item is a
     * course code when it is at the left margin, is at most 7 characters long
     * and the item 12 positions further on is at the row-end position.
     *
     * Problems are reported through throwTranscriptScraperError; nothing is
     * stored in that case.
     *
     * @param {Array<{str: string, transform: number[]}>} textObjects - Text
     *     items of the whole document in reading order; transform[4] is used
     *     as the x position.
     */
    function evaluatePDFtextObjectArray(textObjects) {
        // Positions and offsets hard-coded by the Ladok PDF generator.
        const LEFT_MARGIN_X = 56.692;
        const ROW_END_X = 510.233;
        const PAGE_BREAK_MARKER_X = 497.66899718999997;
        const ROW_END_OFFSET = 12;
        const MAX_CODE_LENGTH = 7; // longest course ID found so far at KTH

        // x position of the item at `index`, or undefined when there is no
        // such item. Keeps look-ahead/look-behind from reading past the array.
        const xAt = (index) => {
            const item = textObjects[index];
            return item && item.transform ? item.transform[4] : undefined;
        };

        const scrapedCodes = [];

        let insideKTH = false;          // currently below a KTH heading
        let foundKTH = false;           // a KTH heading was seen at least once
        let insideTable = false;        // currently transcribing the course table
        let tableSeen = false;          // a table has already been started
        let sawPlainTranscript = false; // title of the non-national transcript was seen

        for (let i = 0; i < textObjects.length; i++) {
            const { str } = textObjects[i];
            const x = xAt(i);

            const isKTHHeading =
                str === "Kungliga Tekniska högskolan" || str === "KTH Royal Institute of Technology";
            if (!insideKTH && x === LEFT_MARGIN_X && isKTHHeading) {
                insideKTH = true;
                foundKTH = true;
                continue;
            }

            if (str === "Resultatintyg" || str === "Official Transcript of Records") {
                sawPlainTranscript = true;
            }

            if (!insideKTH) {
                continue;
            }

            // TODO: the first table after the KTH heading is assumed to be the
            //       completed-courses table; a check of its title may be needed.
            if (str === "Code" || str === "Kod") {
                if (insideTable) {
                    tableSeen = true; // a table was already found and transcribed
                }

                if (!tableSeen) {
                    insideTable = true;
                } else if (xAt(i - 2) !== PAGE_BREAK_MARKER_X) {
                    // Not a continuation after a page break: what follows are
                    // uncompleted courses or courses from other universities.
                    insideTable = false;
                    insideKTH = false;
                }
            }

            const looksLikeCourseCode =
                x === LEFT_MARGIN_X &&
                xAt(i + ROW_END_OFFSET) === ROW_END_X &&
                typeof str === "string" &&
                str.length <= MAX_CODE_LENGTH;
            if (insideTable && looksLikeCourseCode) {
                scrapedCodes.push(str);
            }
        }

        if (sawPlainTranscript && scrapedCodes.length === 0) {
            throwTranscriptScraperError("Provided Official Transcript of Records instead of National Official transcript of records.");
            return;
        }

        if (!foundKTH) {
            throwTranscriptScraperError("Provided pdf doesn't contain KTH.");
            return;
        }

        if (scrapedCodes.length === 0) {
            throwTranscriptScraperError("Couldn't find any tables to transcribe.");
            return;
        }

        writeLocalStorage_completedCourses(scrapedCodes);
    }

    /**
     * Change handler for the upload field: starts scraping the first selected
     * file and then sets the input value state to ''.
     *
     * @param {Event} event - Change event whose target.files holds the selection.
     */
    const handleFileChange = (event) => {
        const file = event.target.files[0];

        // The scraper runs asynchronously and is not awaited here; the catch
        // handler makes sure a rejection is reported instead of left unhandled.
        transcriptScraperFunction(file).catch((e) => {
            throwTranscriptScraperError(`While reading the uploaded file something went wrong. ${e}`);
        });

        // NOTE(review): presumably resets the field so the same file can be
        // picked again — confirm against UploadField.
        setFileInputValue('');
    };

    // Render the upload field, handing it the current error text and
    // visibility, the change handler and the input value state.
    return (
        <UploadField
            errorMessage={errorMessage}
            errorVisibility={errorVisibility}
            handleFileChange={handleFileChange}
            fileInputValue = {fileInputValue}
        />);
});

export { UploadTranscriptPresenter };