Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ twine upload dist/*
| Ruby | text/x-ruby |
| Shell | text/x-shellscript |
| XML | text/xml |
| PHP | text/x-php |

And more to come!

Expand Down
3 changes: 3 additions & 0 deletions comment_parser/comment_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Javascript
Ruby
XML
PHP

Dependencies:
python-magic: pip install python-magic (optional)
Expand All @@ -32,6 +33,7 @@
from comment_parser.parsers import python_parser
from comment_parser.parsers import ruby_parser
from comment_parser.parsers import shell_parser
from comment_parser.parsers import php_parser

MIME_MAP = {
'application/javascript': js_parser, # Javascript
Expand All @@ -46,6 +48,7 @@
'text/x-ruby': ruby_parser, # Ruby
'text/x-shellscript': shell_parser, # Unix shell
'text/xml': html_parser, # XML
'text/x-php': php_parser, # PHP
}


Expand Down
72 changes: 72 additions & 0 deletions comment_parser/parsers/php_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/python
"""This module provides methods for parsing comments from PHP.

Works with:
PHP
"""

import re
from bisect import bisect_left
from comment_parser.parsers import common


def extract_comments(code):
    """Extracts a list of comments from the given PHP source code.

    Comments are represented with the Comment class found in the common module.
    PHP comments come in two forms, single and multi-line comments.
        - Single-line comments begin with '//' or '#' and continue to the end
          of line.
        - Multi-line comments begin with '/*' and end with '*/' and can span
          multiple lines of code. If a multi-line comment does not terminate
          before EOF is reached, then an exception is raised.

    Comment markers are ignored when they appear inside:
        - single- or double-quoted string literals,
        - text between a closing '?>' tag and the next '<?php' tag (including
          everything before the first '<?php' tag of the file),
        - heredoc / nowdoc bodies. The opening identifier may be bare,
          single-quoted (nowdoc) or double-quoted (heredoc), and may be
          separated from '<<<' by spaces or tabs.

    Args:
        code: String containing code to extract comments from.
    Returns:
        Python list of common.Comment in the order that they appear in the code.
    Raises:
        common.UnterminatedCommentError: Encountered an unterminated multi-line
            comment.
    """
    # Alternatives are tried left to right at every position, so anything the
    # 'literal' branch swallows can never be reported as a comment.
    # Group numbers inside 'literal' matter: \2 is the string quote, \6 the
    # optional quote around the heredoc identifier and \7 the identifier itself.
    pattern = r"""
        (?P<literal> (?:([\"'])((?:\\\2|(?:(?!\2)).|\n)*)(\2))|\?>((?!<\?php\s).|\n)*<\?php\s|<<<[ \t]*(['\"]?)(([a-zA-Z0-9_]|[^\x00-\x7F])([a-zA-Z0-9_]|[^\x00-\x7F])*)\6((?!^\7;?$)(.|\n))*^\7;?$) |
        (?P<single> (?://|\#)(?P<single_content>.*)?$) |
        (?P<multi> /\*(?P<multi_content>(.|\n)*?)?\*/) |
        (?P<error> /\*(.*)?)
    """
    compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    # The regex treats everything between '?>' and '<?php' as a literal.
    # Wrapping the input makes the scan start outside of PHP tags and
    # guarantees a closing '<?php' exists even when the file ends outside of
    # PHP tags.
    code = "?>\n" + code + "\n<?php "

    # Offsets of every end of line in the wrapped text, in ascending order.
    line_ends = [
        line_end.start() for line_end in re.finditer(r"$", code, re.MULTILINE)
    ]

    comments = []
    for match in compiled.finditer(code):
        kind = match.lastgroup

        # Number of line ends strictly before the match. The synthetic '?>'
        # line prepended above accounts for one of them, so this count is
        # already the 1-based line number within the caller's original text.
        line_no = bisect_left(line_ends, match.start())

        if kind == "single":
            comments.append(
                common.Comment(match.group("single_content"), line_no))
        elif kind == "multi":
            comments.append(
                common.Comment(
                    match.group("multi_content"), line_no, multiline=True))
        elif kind == "error":
            raise common.UnterminatedCommentError()
        # kind == "literal": strings, inline HTML and heredocs carry no
        # comments and are skipped.

    return comments
172 changes: 172 additions & 0 deletions comment_parser/parsers/tests/php_parser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/python
"""Tests for comment_parser.parsers.php_parser.py"""

import unittest
from comment_parser.parsers import common
from comment_parser.parsers import php_parser


class PHPParserTest(unittest.TestCase):
    """Exercises php_parser.extract_comments on small PHP snippets."""

    def testSimpleSingleLineComment(self):
        """A '//' comment is reported with its text and line number."""
        source = """<?php
// this is a comment
echo "Hello World";"""
        found = php_parser.extract_comments(source)
        self.assertEqual(
            found, [common.Comment(" this is a comment", 2, multiline=False)])

    def testOtherSingleLineComment(self):
        """A '#' comment is reported like a '//' comment."""
        source = """<?php
# this is a comment
echo "Hello World";"""
        found = php_parser.extract_comments(source)
        self.assertEqual(
            found, [common.Comment(" this is a comment", 2, multiline=False)])

    def testSingleLineCommentInStringLiteral(self):
        """'//' inside a double-quoted string is not a comment."""
        source = '''<?php
echo "// this is not a comment";'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testMultiLineComment(self):
        """A '/* */' comment spanning two lines keeps its embedded newline."""
        source = '''<?php
/* multiline\ncomment */'''
        found = php_parser.extract_comments(source)
        self.assertEqual(
            found, [common.Comment(' multiline\ncomment ', 2, multiline=True)])

    def testMultiLineCommentWithStars(self):
        """Only the outermost '/*' and '*/' are stripped from the content."""
        source = """<?php
/***************/"""
        found = php_parser.extract_comments(source)
        self.assertEqual(
            found, [common.Comment("*************", 2, multiline=True)])

    def testMultiLineCommentInStringLiteral(self):
        """'/* */' inside a double-quoted string is not a comment."""
        source = '''<?php
echo "/* This is not a\\nmultiline comment */";'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testMultiLineCommentUnterminated(self):
        """A '/*' without a matching '*/' raises UnterminatedCommentError."""
        source = '''<?php
$a = 1; /* Unterminated\\n comment'''
        with self.assertRaises(common.UnterminatedCommentError):
            php_parser.extract_comments(source)

    def testMultipleMultilineComments(self):
        """Two '/* */' comments on one line are reported separately."""
        source = '''<?php
/* abc */ /* 123 */'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [
            common.Comment(' abc ', 2, multiline=True),
            common.Comment(' 123 ', 2, multiline=True),
        ])

    def testStringThenComment(self):
        """A quote inside a comment that follows a string stays in the comment."""
        source = '''<?php
echo "" /* "abc */;'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [
            common.Comment(' "abc ', 2, multiline=True),
        ])

    def testCommentStartInsideEscapedQuotesInStringLiteral(self):
        """Placeholder: currently performs no checks (see TODO below)."""
        # TODO(#27): Re-enable test.
        # NOTE(review): the disabled body below still calls c_parser and has no
        # '<?php' tag, so it presumably needs adapting before being re-enabled.
        # code = r'" \" /* \" "'
        # comments = c_parser.extract_comments(code)
        # self.assertEqual(comments, [])
        pass

    def testStringEscapedBackslashCharacter(self):
        """No comment is reported for a '#' following the string "\\\\"."""
        source = r'''<?php
echo "\\"; # This wouldn't be a comment, if the second " is misinterpreted as escaped
'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testCommentedOtherComment(self):
        """'//' followed by '#' yields one comment whose text starts with '#'."""
        source = '''<?php
//# double comment'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [common.Comment('# double comment', 2)])

    def testOtherCommentedComment(self):
        """'#' followed by '//' yields one comment whose text starts with '//'."""
        source = '''<?php
#// double comment'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [common.Comment('// double comment', 2)])

    def testNoPhpTag(self):
        """Without a '<?php' tag nothing is treated as a comment."""
        source = '''#// double comment'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testCommentedPhpTag(self):
        """A '?>' inside a '#' comment does not end the PHP section."""
        source = '''<?php
# ?>
/* Wouldn't be a commend if commented php end tag was misinterpreted */'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [
            common.Comment(' ?>', 2, multiline=False),
            common.Comment(
                " Wouldn't be a commend if commented php end tag was misinterpreted ",
                3,
                multiline=True)
        ])

    def testCommentsOutsidePhpTag(self):
        """Text between '?>' and '<?php' is ignored; later comments count."""
        source = '''<?php echo "Hi";
?>
// This is no comment <?php
// But this is'''
        found = php_parser.extract_comments(source)
        self.assertEqual(
            found, [common.Comment(' But this is', 4, multiline=False)])

    def testMultilineString(self):
        """Comment markers and PHP tags inside a multi-line string are ignored."""
        source = '''<?php echo "Hi
// Multi
# Line
?>
<?php
/* String */
";'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testHereDocString(self):
        """Comment markers inside a heredoc body are ignored."""
        source = '''<?php echo <<<Aä_0
// Multi "
# Line
?>
<?php
/* String */"\nAä_0;'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [])

    def testFakeHereDocString(self):
        """'<<<' inside a string literal does not start a heredoc."""
        source = '''<?php echo "<<<Aä_0
// Multi ";
# Line
?>
\nAä_0;\n
<?php
/* String */"'''
        found = php_parser.extract_comments(source)
        self.assertEqual(found, [
            common.Comment(" Line", 3, False),
            common.Comment(" String ", 9, True)
        ])

    def testMultilineCommentWithEmptyFirstLine(self):
        """A '/*' followed directly by a newline keeps that newline in the text."""
        source = '<?php\n/*\n| Comment Roll\n*/\nint main(){return 0;}'
        found = php_parser.extract_comments(source)
        self.assertEqual(found,
                         [common.Comment('\n| Comment Roll\n', 2, True)])