Skip to content

Commit e9810de

Browse files
Merge branch 'fixDataDetection340-901' into 'stable-3_4_0'
Estende a verificação do PDF para utilizar dupla-conversão do PDF - 3.4.0
See merge request softwares-pkp/plugins_ojs/verificacao-metadados-documento!39
2 parents: 3e82b45 + 6a8822e; commit: e9810de

4 files changed

Lines changed: 53 additions & 17 deletions

File tree

classes/ContentParser.php

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,10 +69,11 @@ private function parseLine($line)
6969
return $lineWords;
7070
}
7171

72-
public function parseDocument($pathFile)
72+
public function parseDocument($pathFile, $useRawMode = true)
7373
{
7474
$pathTxt = substr($pathFile, 0, -3) . 'txt';
75-
shell_exec("pdftotext " . $pathFile . " " . $pathTxt . " -raw 2>/dev/null");
75+
$command = "pdftotext $pathFile $pathTxt" . ($useRawMode ? ' -raw ' : '') . " 2>/dev/null";
76+
shell_exec($command);
7677

7778
$docText = file_get_contents($pathTxt);
7879
$docLines = preg_split("/\r\n|\n|\r/", $docText);

classes/DocumentChecker.php

Lines changed: 23 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -17,12 +17,14 @@ class DocumentChecker
1717
{
1818
private $pathFile;
1919
public $words;
20+
public $secondaryWords;
2021

2122
public function __construct($path)
2223
{
2324
$this->pathFile = $path;
2425
$parser = new ContentParser();
2526
$this->words = $parser->parseDocument($path);
27+
$this->secondaryWords = $parser->parseDocument($path, false);
2628
}
2729

2830
private $patternsContribution = [
@@ -125,14 +127,14 @@ private function checkOrcid($text)
125127
["dados", "da", "pesquisa"]
126128
];
127129

128-
private function checkForPattern($patterns, $limit, $limiarForWord, $limiarForPattern)
130+
private function checkForPatternsInWordList($wordList, $patterns, $limit, $limiarForWord, $limiarForPattern)
129131
{
130-
for ($i = 0; $i < count($this->words) - $limit; $i++) {
132+
for ($i = 0; $i < count($wordList) - $limit; $i++) {
131133
for ($j = 0; $j < count($patterns); $j++) {
132134
$depth = $similarWords = 0;
133135

134136
foreach ($patterns[$j] as $wordPattern) {
135-
$wordFromText = $this->words[$i + $depth];
137+
$wordFromText = $wordList[$i + $depth];
136138
similar_text($wordFromText, $wordPattern, $similarity);
137139
$depth++;
138140

@@ -152,9 +154,20 @@ private function checkForPattern($patterns, $limit, $limiarForWord, $limiarForPa
152154
return 'Error';
153155
}
154156

157+
public function checkForPatterns($patterns, $limit, $limiarForWord, $limiarForPattern)
158+
{
159+
$result = $this->checkForPatternsInWordList($this->words, $patterns, $limit, $limiarForWord, $limiarForPattern);
160+
161+
if ($result == 'Error') {
162+
$result = $this->checkForPatternsInWordList($this->secondaryWords, $patterns, $limit, $limiarForWord, $limiarForPattern);
163+
}
164+
165+
return $result;
166+
}
167+
155168
public function checkAuthorsContribution()
156169
{
157-
return $this->checkForPattern($this->patternsContribution, 3, 75, 1);
170+
return $this->checkForPatterns($this->patternsContribution, 3, 75, 1);
158171
}
159172

160173
public function checkTextOrcidsNumber()
@@ -176,17 +189,17 @@ public function checkTextOrcidsNumber()
176189

177190
public function checkConflictInterest()
178191
{
179-
return $this->checkForPattern($this->patternsConflictInterest, 3, 75, 1);
192+
return $this->checkForPatterns($this->patternsConflictInterest, 3, 75, 1);
180193
}
181194

182195
public function checkKeywordsInEnglish()
183196
{
184-
return $this->checkForPattern($this->patternsKeywordsEnglish, 2, 92, 1);
197+
return $this->checkForPatterns($this->patternsKeywordsEnglish, 2, 92, 1);
185198
}
186199

187200
public function checkAbstractInEnglish()
188201
{
189-
return $this->checkForPattern($this->patternsAbstractEnglish, 2, 92, 1);
202+
return $this->checkForPatterns($this->patternsAbstractEnglish, 2, 92, 1);
190203
}
191204

192205
public function checkTitleInEnglish($title)
@@ -199,16 +212,16 @@ public function checkTitleInEnglish($title)
199212
$cleanedTitle = $parser->cleanStyledText($title);
200213
$patternTitle = $parser->createPatternFromString($cleanedTitle);
201214

202-
return $this->checkForPattern(array($patternTitle), count($patternTitle), 75, 0.75);
215+
return $this->checkForPatterns(array($patternTitle), count($patternTitle), 75, 0.75);
203216
}
204217

205218
public function checkEthicsCommittee()
206219
{
207-
return $this->checkForPattern($this->patternsEthicsCommittee, 2, 75, 1);
220+
return $this->checkForPatterns($this->patternsEthicsCommittee, 2, 75, 1);
208221
}
209222

210223
public function checkDataStatement()
211224
{
212-
return $this->checkForPattern($this->patternsDataStatement, 3, 90, 1);
225+
return $this->checkForPatterns($this->patternsDataStatement, 3, 90, 1);
213226
}
214227
}

tests/DetectionOnDocumentTest.php

Lines changed: 25 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
use PHPUnit\Framework\TestCase;
66
use APP\plugins\generic\contentAnalysis\classes\DocumentChecker;
77

8-
abstract class DetectionOnDocumentTest extends TestCase
8+
class DetectionOnDocumentTest extends TestCase
99
{
1010
protected const FIXTURES_PATH = __DIR__ . DIRECTORY_SEPARATOR . 'fixtures' . DIRECTORY_SEPARATOR;
1111
protected $documentChecker;
@@ -28,8 +28,30 @@ protected function insertWordsIntoDocWordList($words, $docWordList)
2828
);
2929
}
3030

31-
protected function insertStringIntoTextHtml($string, $textHtml)
31+
public function testCheckerHasWords(): void
3232
{
33-
return $textHtml . " " . $string;
33+
$this->assertNotNull($this->documentChecker->words);
34+
$this->assertNotEmpty($this->documentChecker->words);
35+
36+
$this->assertNotNull($this->documentChecker->secondaryWords);
37+
$this->assertNotEmpty($this->documentChecker->secondaryWords);
38+
}
39+
40+
public function testGeneralPatternDetectionOnPrimaryWords(): void
41+
{
42+
$pattern = ['expected', 'pattern', 'to', 'be', 'found'];
43+
$backupSecondaryWords = $this->documentChecker->secondaryWords;
44+
45+
$checkResult = $this->documentChecker->checkForPatterns([$pattern], 5, 50, 1);
46+
$this->assertEquals('Error', $checkResult);
47+
48+
$this->documentChecker->secondaryWords = $this->insertWordsIntoDocWordList($pattern, $this->documentChecker->secondaryWords);
49+
$checkResult = $this->documentChecker->checkForPatterns([$pattern], 5, 50, 1);
50+
$this->assertEquals('Success', $checkResult);
51+
52+
$this->documentChecker->secondaryWords = $backupSecondaryWords;
53+
$this->documentChecker->words = $this->insertWordsIntoDocWordList($pattern, $this->documentChecker->words);
54+
$checkResult = $this->documentChecker->checkForPatterns([$pattern], 5, 50, 1);
55+
$this->assertEquals('Success', $checkResult);
3456
}
3557
}

version.xml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@
1212
<version>
1313
<application>contentAnalysis</application>
1414
<type>plugins.generic</type>
15-
<release>2.2.5.0</release>
16-
<date>2026-02-27</date>
15+
<release>2.2.6.0</release>
16+
<date>2026-04-23</date>
1717
<lazy-load>1</lazy-load>
1818
<class>ContentAnalysisPlugin</class>
1919
</version>

0 commit comments

Comments
 (0)