1+ import React from 'react' ;
2+ import { observer } from "mobx-react-lite" ;
3+ import * as pdfjsLib from "pdfjs-dist" ;
4+ import pdfWorker from "pdfjs-dist/build/pdf.worker?url" ;
5+ import { useState } from "react" ;
6+ import UploadField from '../views/Components/SideBarComponents/UploadField' ;
7+
// Tell pdf.js where to load its worker script from: the value imported above
// from "pdfjs-dist/build/pdf.worker?url".
// NOTE(review): the "?url" suffix looks like a bundler asset-URL import (e.g. Vite) — confirm.
pdfjsLib . GlobalWorkerOptions . workerSrc = pdfWorker ;
9+
10+ const UploadTranscriptPresenter = observer ( ( { model } ) => {
11+ const [ errorMessage , setErrorMessage ] = useState ( "" ) ; // Stores error message
12+ const [ errorVisibility , setErrorVisibility ] = useState ( "hidden" ) ; // Controls visibility
13+ const [ fileInputValue , setFileInputValue ] = useState ( "" ) ; // Controls upload field state
14+
/**
 * Reads an uploaded PDF with pdf.js, collects the text items of every page
 * (in page order) and hands them to evaluatePDFtextObjectArray.
 *
 * Failures never reject the returned promise: they are reported through
 * throwTranscriptScraperError so the upload field can display them.
 *
 * @param {File|undefined} file - The file picked in the upload input.
 * @returns {Promise<void>}
 */
async function transcriptScraperFunction(file) {
  if (!file) {
    console.log("element: 'PDF-Scraper-Input' changed, but we havent gotten a file yet.");
    return;
  }
  if (file.type !== "application/pdf") {
    throwTranscriptScraperError("Uploaded file isn't PDF.");
    return;
  }

  // A new attempt starts: hide whatever error the previous upload left behind.
  setErrorVisibility("hidden");

  try {
    // Reading the file is inside the try block as well, so an unreadable file
    // is reported to the user instead of becoming an unhandled rejection.
    const arrayBuffer = await file.arrayBuffer();
    const typedArray = new Uint8Array(arrayBuffer);
    const pdf = await pdfjsLib.getDocument({ data: typedArray }).promise;

    // All text items of the document, page by page.
    const textObjects = [];
    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      // Push one by one: spreading a very large page into push() can exceed
      // the engine's argument limit.
      for (const item of textContent.items) {
        textObjects.push(item);
      }
    }

    evaluatePDFtextObjectArray(textObjects);
  } catch (e) {
    throwTranscriptScraperError("While parsing the pdf something went wrong. " + e);
  }
}
60+
/**
 * Reports a scraper failure: writes it to the console, stores it as the
 * component's error message and switches the error visibility to "visible".
 * Despite the name, nothing is thrown.
 *
 * @param {string} txt - Description of what went wrong.
 */
function throwTranscriptScraperError(txt) {
  const consoleLine = "PDF-Scraper-Error: " + txt;
  const displayedLine = "Error: " + txt;

  console.log(consoleLine);
  setErrorMessage(displayedLine);
  setErrorVisibility("visible");
}
66+
/**
 * Merges the given course codes into the "completedCourses" list kept in
 * localStorage (as a JSON array), removing duplicates, and then dispatches a
 * "completedCourses changed" event on window.
 *
 * The previously stored codes are sorted and come first; new codes follow in
 * the order they were passed in.
 *
 * @param {string[]} codesArr - Course codes to add.
 */
function writeLocalStorage_completedCourses(codesArr) {
  const STORAGE_KEY = "completedCourses";

  // Read what is already stored. Anything that is not a JSON array (corrupt
  // text, "null", an object, ...) is treated as "nothing stored yet" instead
  // of crashing the whole upload.
  let stored = [];
  const raw = localStorage.getItem(STORAGE_KEY);
  if (raw) {
    try {
      const parsed = JSON.parse(raw);
      if (Array.isArray(parsed)) {
        stored = parsed;
      }
    } catch (e) {
      console.warn("Ignoring unreadable '" + STORAGE_KEY + "' entry in localStorage: " + e);
    }
  }

  stored.sort();

  // A Set keeps first occurrences in insertion order, which dedupes the merge.
  const merged = [...new Set(stored.concat(codesArr))];

  localStorage.setItem(STORAGE_KEY, JSON.stringify(merged));

  window.dispatchEvent(new Event("completedCourses changed"));
}
87+
/**
 * Scans the text items of a transcript PDF for the course codes listed in the
 * first table that follows the KTH heading, and stores them through
 * writeLocalStorage_completedCourses.
 *
 * Detection relies on x positions (transform[4]) that are hard-coded in the
 * Ladok PDF generator. If nothing usable is found, the reason is reported
 * through throwTranscriptScraperError and nothing is stored.
 *
 * @param {Array<{str: string, transform: number[]}>} textObjects - Text items
 *   of all pages, in document order.
 */
function evaluatePDFtextObjectArray(textObjects) {
  // x position at which the university name and the course-code column start.
  const LEFT_MARGIN_X = 56.692;
  // x position of the item expected ROW_END_OFFSET places after a course code.
  const ROW_END_X = 510.233;
  const ROW_END_OFFSET = 12;
  // x position of the item two places before a repeated "Code"/"Kod" header
  // when the same table merely continues (e.g. after a page break).
  const TABLE_CONTINUES_X = 497.66899718999997;
  // Longest course code found so far at KTH.
  const MAX_CODE_LENGTH = 7;

  const KTH_NAMES = ["Kungliga Tekniska högskolan", "KTH Royal Institute of Technology"];
  const WRONG_DOCUMENT_TITLES = ["Resultatintyg", "Official Transcript of Records"];

  // x position of the item at `index`, or undefined when there is no such
  // item. This keeps the look-ahead / look-behind from reading past the array.
  const xAt = (index) => {
    const item = textObjects[index];
    return item && item.transform ? item.transform[4] : undefined;
  };

  const scrapedCodes = [];

  let insideKthSection = false;
  let kthEverFound = false;
  let insideTable = false;
  let firstTableSeen = false;
  let wrongDocumentTitleSeen = false;

  for (let i = 0; i < textObjects.length; i++) {
    const item = textObjects[i];
    const str = item ? item.str : undefined;
    const x = xAt(i);

    // Look for our university; Ladok prints its name at the left margin.
    if (!insideKthSection && x === LEFT_MARGIN_X && KTH_NAMES.includes(str)) {
      insideKthSection = true;
      kthEverFound = true;
      continue;
    }

    if (!wrongDocumentTitleSeen && WRONG_DOCUMENT_TITLES.includes(str)) {
      wrongDocumentTitleSeen = true;
    }

    if (!insideKthSection) {
      continue;
    }

    // The first table after the KTH heading should be the completed courses.
    // TODO: not necessarily true; a check for 'completed courses'/'avslutade
    // kurser' similar to the KTH check might be needed.
    //
    // The first text of every table is "Code"/"Kod". A repeated header is
    // either the same table continuing or the start of a different table.
    if (str === "Code" || str === "Kod") {
      if (insideTable) {
        firstTableSeen = true; // one table has already been found
      }

      if (!firstTableSeen) {
        insideTable = true;
      } else if (xAt(i - 2) !== TABLE_CONTINUES_X) {
        // A genuinely new table: incomplete courses or another university,
        // so the KTH courses are done.
        insideTable = false;
        insideKthSection = false;
      }
    }

    // A course-code cell sits at the left margin, is followed ROW_END_OFFSET
    // items later by an item at ROW_END_X, and is at most MAX_CODE_LENGTH long.
    if (
      insideTable &&
      x === LEFT_MARGIN_X &&
      xAt(i + ROW_END_OFFSET) === ROW_END_X &&
      typeof str === "string" &&
      str.length <= MAX_CODE_LENGTH
    ) {
      scrapedCodes.push(str);
    }
  }

  if (wrongDocumentTitleSeen && scrapedCodes.length === 0) {
    throwTranscriptScraperError("Provided Official Transcript of Records instead of National Official transcript of records.");
    return;
  }

  if (!kthEverFound) {
    throwTranscriptScraperError("Provided pdf doesn't contain KTH.");
    return;
  }

  if (scrapedCodes.length === 0) {
    throwTranscriptScraperError("Couldn't find any tables to transcribe.");
    return;
  }

  writeLocalStorage_completedCourses(scrapedCodes);
}
169+
/**
 * Change handler for the upload input: starts scraping the first selected
 * file and resets the controlled input value to an empty string.
 *
 * @param {{target: {files: ?ArrayLike<File>}}} event - Change event of the file input.
 */
const handleFileChange = (event) => {
  const files = event.target.files;
  const file = files ? files[0] : undefined;

  // The scraper is async and is deliberately not awaited here; attach a
  // handler so a rejection is shown to the user instead of going unhandled.
  Promise.resolve(transcriptScraperFunction(file)).catch((err) => {
    throwTranscriptScraperError("While parsing the pdf something went wrong. " + err);
  });

  setFileInputValue('');
};
178+
179+ return (
180+ < UploadField
181+ errorMessage = { errorMessage }
182+ errorVisibility = { errorVisibility }
183+ handleFileChange = { handleFileChange }
184+ fileInputValue = { fileInputValue }
185+ /> ) ;
186+ } ) ;
187+
188+ export { UploadTranscriptPresenter } ;
0 commit comments