Skip to content

Commit 3067fd1

Browse files
Improves detection of numbered documents
Also removes all uses of the 'array' function.
Issue: documentacao-e-tarefas/scielo#912
Signed-off-by: Jhon <jhon@lepidus.com.br>
1 parent e060dcf commit 3067fd1

6 files changed

Lines changed: 28 additions & 12 deletions

File tree

classes/ContentParser.php

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ class ContentParser
1515
{
1616
private const ZERO_WIDTH_SPACE = "\x{200B}";
1717
private const MIN_WORD_LENGTH = 2;
18+
private const NUM_DOC_LINES_SAMPLE = 5;
1819

1920
private function cleanWord($word)
2021
{
@@ -30,7 +31,7 @@ private function cleanWord($word)
3031

3132
private function parseWordsFromString($string)
3233
{
33-
$words = array();
34+
$words = [];
3435

3536
for ($i = 0; $i < strlen($string); $i++) {
3637
while ($i < strlen($string) && ctype_space($string[$i])) {
@@ -56,19 +57,32 @@ private function parseWordsFromString($string)
5657
return $words;
5758
}
5859

59-
private function parseLine($line)
60+
private function parseLine(string $line, bool $docIsNumbered)
6061
{
6162
$zeroWidthSpacePattern = '/' . self::ZERO_WIDTH_SPACE . '/u';
6263
$line = preg_replace($zeroWidthSpacePattern, '', $line);
6364
$lineWords = $this->parseWordsFromString($line);
6465

65-
if (!empty($lineWords) && is_numeric($lineWords[0])) {
66+
if ($docIsNumbered && !empty($lineWords) && is_numeric($lineWords[0])) {
6667
array_shift($lineWords);
6768
}
6869

6970
return $lineWords;
7071
}
7172

73+
public function checkDocumentIsNumbered(array $docLines): bool
74+
{
75+
for ($i = 0; $i < self::NUM_DOC_LINES_SAMPLE; $i++) {
76+
$parsedLine = explode(' ', $docLines[$i]);
77+
$firstWord = $parsedLine[0];
78+
79+
if (!is_numeric($firstWord)) {
80+
return false;
81+
}
82+
}
83+
return true;
84+
}
85+
7286
public function parseDocument($pathFile, $useRawMode = true)
7387
{
7488
$pathTxt = substr($pathFile, 0, -3) . 'txt';
@@ -77,19 +91,21 @@ public function parseDocument($pathFile, $useRawMode = true)
7791

7892
$docText = file_get_contents($pathTxt);
7993
$docLines = preg_split("/\r\n|\n|\r/", $docText);
80-
$docWords = array();
94+
$docWords = [];
8195
unlink($pathTxt);
8296

97+
$docIsNumbered = $this->checkDocumentIsNumbered($docLines);
98+
8399
foreach ($docLines as $line) {
84-
$docWords = array_merge($docWords, $this->parseLine($line));
100+
$docWords = array_merge($docWords, $this->parseLine($line, $docIsNumbered));
85101
}
86102

87103
return $docWords;
88104
}
89105

90106
public function createPatternFromString($string)
91107
{
92-
$pattern = array();
108+
$pattern = [];
93109

94110
for ($i = 0; $i < strlen($string); $i++) {
95111
while ($i < strlen($string) && ctype_space($string[$i])) {

classes/DocumentChecker.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -212,7 +212,7 @@ public function checkTitleInEnglish($title)
212212
$cleanedTitle = $parser->cleanStyledText($title);
213213
$patternTitle = $parser->createPatternFromString($cleanedTitle);
214214

215-
return $this->checkForPatterns(array($patternTitle), count($patternTitle), 75, 0.75);
215+
return $this->checkForPatterns([$patternTitle], count($patternTitle), 75, 0.75);
216216
}
217217

218218
public function checkEthicsCommittee()

classes/DocumentChecklist.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ public function __construct($path)
2424

2525
public function executeChecklist($submission)
2626
{
27-
$dataChecklist = array();
27+
$dataChecklist = [];
2828
$submissionIsArticle = !$submission->getData('nonArticle');
2929

3030
if ($submissionIsArticle) {

tests/AuthorsContributionTest.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
class AuthorsContributionTest extends DetectionOnDocumentTest
77
{
8-
private $patternContribution = array("contribuição", "dos", "autores");
8+
private $patternContribution = ["contribuição", "dos", "autores"];
99

1010
public function setUp(): void
1111
{

tests/EthicsCommitteeTest.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
class EthicsCommitteeTest extends DetectionOnDocumentTest
77
{
8-
private $patternCommittee = array("aprovação", "do", "comitê", "de", "ética");
8+
private $patternCommittee = ["aprovação", "do", "comitê", "de", "ética"];
99

1010
public function setUp(): void
1111
{

tests/MetadataEnglishTest.php

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@
66

77
class MetadataEnglishTest extends DetectionOnDocumentTest
88
{
9-
private $patternKeywords = array("keywords");
10-
private $patternAbstract = array("abstract");
9+
private $patternKeywords = ["keywords"];
10+
private $patternAbstract = ["abstract"];
1111
private $title = "A beautiful title";
1212
private $titleWithStyling = "<b>A</b> <i>beautiful</i> <u>title</u>";
1313

0 commit comments

Comments
 (0)