@@ -15,6 +15,7 @@ class ContentParser
1515{
1616 private const ZERO_WIDTH_SPACE = "\x{200B} " ;
1717 private const MIN_WORD_LENGTH = 2 ;
18+ private const NUM_DOC_LINES_SAMPLE = 5 ;
1819
1920 private function cleanWord ($ word )
2021 {
@@ -30,7 +31,7 @@ private function cleanWord($word)
3031
3132 private function parseWordsFromString ($ string )
3233 {
33- $ words = array () ;
34+ $ words = [] ;
3435
3536 for ($ i = 0 ; $ i < strlen ($ string ); $ i ++) {
3637 while ($ i < strlen ($ string ) && ctype_space ($ string [$ i ])) {
@@ -56,19 +57,32 @@ private function parseWordsFromString($string)
5657 return $ words ;
5758 }
5859
/**
 * Split a single document line into words.
 *
 * Zero-width spaces are removed first, then the line is tokenised with
 * parseWordsFromString(). When the document is numbered and the first
 * token is numeric, that token is treated as a line number and dropped.
 *
 * @param string $line          One line of document text.
 * @param bool   $docIsNumbered Whether the document was detected as line-numbered.
 *
 * @return array Words found on the line (whatever parseWordsFromString() returns,
 *               minus a leading numeric token for numbered documents).
 */
private function parseLine(string $line, bool $docIsNumbered)
{
    $zeroWidthSpacePattern = '/' . self::ZERO_WIDTH_SPACE . '/u';
    $cleanedLine = preg_replace($zeroWidthSpacePattern, '', $line);

    // preg_replace() returns null on failure (e.g. malformed UTF-8 under the
    // /u modifier). Fall back to the untouched line so its words are not
    // silently discarded.
    if ($cleanedLine === null) {
        $cleanedLine = $line;
    }

    $lineWords = $this->parseWordsFromString($cleanedLine);

    // Only strip a leading number when the whole document is numbered;
    // otherwise a line that merely starts with a number would lose real content.
    if ($docIsNumbered && !empty($lineWords) && is_numeric($lineWords[0])) {
        array_shift($lineWords);
    }

    return $lineWords;
}
7172
/**
 * Decide whether a document looks line-numbered.
 *
 * Inspects at most the first NUM_DOC_LINES_SAMPLE lines. The document is
 * considered numbered only if every inspected line starts with a numeric
 * token (the text before the first space).
 *
 * @param array $docLines Lines of the document, in order.
 *
 * @return bool True when every sampled line begins with a numeric token;
 *              false otherwise, including when there are no lines at all.
 */
public function checkDocumentIsNumbered(array $docLines): bool
{
    // Bound the sample to the lines that actually exist, so documents shorter
    // than NUM_DOC_LINES_SAMPLE do not read undefined offsets.
    $sampleLines = array_slice($docLines, 0, self::NUM_DOC_LINES_SAMPLE);

    // No lines means no evidence of numbering.
    if ($sampleLines === []) {
        return false;
    }

    foreach ($sampleLines as $line) {
        $firstWord = explode(' ', $line)[0];

        if (!is_numeric($firstWord)) {
            return false;
        }
    }

    return true;
}
85+
7286 public function parseDocument ($ pathFile , $ useRawMode = true )
7387 {
7488 $ pathTxt = substr ($ pathFile , 0 , -3 ) . 'txt ' ;
@@ -77,19 +91,21 @@ public function parseDocument($pathFile, $useRawMode = true)
7791
7892 $ docText = file_get_contents ($ pathTxt );
7993 $ docLines = preg_split ("/ \r\n| \n| \r/ " , $ docText );
80- $ docWords = array () ;
94+ $ docWords = [] ;
8195 unlink ($ pathTxt );
8296
97+ $ docIsNumbered = $ this ->checkDocumentIsNumbered ($ docLines );
98+
8399 foreach ($ docLines as $ line ) {
84- $ docWords = array_merge ($ docWords , $ this ->parseLine ($ line ));
100+ $ docWords = array_merge ($ docWords , $ this ->parseLine ($ line, $ docIsNumbered ));
85101 }
86102
87103 return $ docWords ;
88104 }
89105
90106 public function createPatternFromString ($ string )
91107 {
92- $ pattern = array () ;
108+ $ pattern = [] ;
93109
94110 for ($ i = 0 ; $ i < strlen ($ string ); $ i ++) {
95111 while ($ i < strlen ($ string ) && ctype_space ($ string [$ i ])) {
0 commit comments