diff --git a/analyzers/MsgParser/Msg_Parser.json b/analyzers/MsgParser/Msg_Parser.json
index 06d5acccc..905a9692d 100644
--- a/analyzers/MsgParser/Msg_Parser.json
+++ b/analyzers/MsgParser/Msg_Parser.json
@@ -1,11 +1,11 @@
{
"name": "Msg_Parser",
- "version": "3.0",
- "author": "CERT-BDF",
+ "version": "4.0",
+ "author": "Waltyon",
"url": "https://github.com/TheHive-Project/Cortex-Analyzers",
"license": "AGPL-V3",
"description": "Parse Outlook MSG files and extract the main artifacts.",
"dataTypeList": ["file"],
"baseConfig": "MsgParser",
"command": "MsgParser/parse.py"
-}
+}
\ No newline at end of file
diff --git a/analyzers/MsgParser/lib/__init__.py b/analyzers/MsgParser/lib/__init__.py
deleted file mode 100755
index 35e377ec3..000000000
--- a/analyzers/MsgParser/lib/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# __import__('pkg_resources').declare_namespace(__name__)
diff --git a/analyzers/MsgParser/lib/msgParser.py b/analyzers/MsgParser/lib/msgParser.py
deleted file mode 100755
index 00ad5356d..000000000
--- a/analyzers/MsgParser/lib/msgParser.py
+++ /dev/null
@@ -1,300 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# --- LICENSE -----------------------------------------------------------------
-#
-# Copyright 2013 Matthew Walker
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-import json
-import os
-import sys
-import glob
-import traceback
-from email.parser import Parser as EmailParser
-import email.utils
-import olefile as OleFile
-
-
-class Attachment:
-
- def __init__(self, msg, dir_):
-
- # print dir_
-
- # Get long filename
- self.longFilename = msg._getStringStream([dir_, '__substg1.0_3707'])
- # print self.longFilename
-
- # Get short filename
- self.shortFilename = msg._getStringStream([dir_, '__substg1.0_3704'])
-
- # Get attachment data
- self.data = msg._getStream([dir_, '__substg1.0_37010102'])
-
- # Get short mimeTag
- self.mimeTag = msg._getStringStream([dir_, '__substg1.0_370E'])
-
- # Get extension
- self.extension = msg._getStringStream([dir_, '__substg1.0_3703'])
-
- def save(self):
- # Use long filename as first preference
- filename = self.longFilename
-
- # Otherwise use the short filename
- if filename is None:
- filename = self.shortFilename
- # Otherwise just make something up!
- if filename is None:
- import random
- import string
- filename = 'UnknownFilename ' + \
- ''.join(random.choice(string.ascii_uppercase + string.digits)
- for _ in range(5)) + ".bin"
- #f = open("/tmp/" + filename, 'wb')
- # if self.data is None:
- #f.write(("Pas de PJ"))
- # f.close()
- # else:
- # f.write((self.data))
- # f.close()
- # return filename
-
-
-def windowsUnicode(string):
- if string is None:
- return None
- if sys.version_info[0] >= 3: # Python 3
- return str(string, 'utf_16_le')
- else: # Python 2
- return unicode(string, 'utf_16_le')
-
-
-class Message(OleFile.OleFileIO):
-
- def __init__(self, filename):
- OleFile.OleFileIO.__init__(self, filename)
-
- def _getStream(self, filename):
- if self.exists(filename):
- stream = self.openstream(filename)
- return stream.read()
- else:
- return None
-
- def _getStringStream(self, filename, prefer='unicode'):
- """Gets a string representation of the requested filename.
- Checks for both ASCII and Unicode representations and returns
- a value if possible. If there are both ASCII and Unicode
- versions, then the parameter /prefer/ specifies which will be
- returned.
- """
-
- if isinstance(filename, list):
- # Join with slashes to make it easier to append the type
- filename = "/".join(filename)
-
- asciiVersion = self._getStream(filename + '001E')
- unicodeVersion = windowsUnicode(self._getStream(filename + '001F'))
- if asciiVersion is None:
- return unicodeVersion
- elif unicodeVersion is None:
- return asciiVersion.decode('ascii', 'ignore')
- else:
- if prefer == 'unicode':
- return unicodeVersion
- else:
- return asciiVersion.decode('ascii', 'ignore')
-
- @property
- def subject(self):
- return self._getStringStream('__substg1.0_0037')
-
- @property
- def header(self):
- try:
- return self._header
- except Exception:
- headerText = self._getStringStream('__substg1.0_007D')
- if headerText is not None:
- self._header = EmailParser().parsestr(headerText)
- else:
- self._header = None
- return self._header
-
- @property
- def date(self):
- # Get the message's header and extract the date
- if self.header is None:
- return None
- else:
- return self.header['date']
-
- @property
- def parsedDate(self):
- return email.utils.parsedate(self.date)
-
- @property
- def attachments(self):
- try:
- return self._attachments
- except Exception:
- # Get the attachments
- attachmentDirs = []
-
- for dir_ in self.listdir():
- if dir_[0].startswith('__attach') and dir_[0] not in attachmentDirs:
- attachmentDirs.append(dir_[0])
-
- self._attachments = []
-
- for attachmentDir in attachmentDirs:
- self._attachments.append(Attachment(self, attachmentDir))
-
- return self._attachments
-
- @property
- def sender(self):
- try:
- return self._sender
- except Exception:
- # Check header first
- if self.header is not None:
- headerResult = self.header["from"]
- if headerResult is not None:
- self._sender = headerResult
- return headerResult
-
- # Extract from other fields
- text = self._getStringStream('__substg1.0_0C1A')
- email = self._getStringStream('__substg1.0_0C1F')
- result = None
- if text is None:
- result = email
- else:
- result = text
- if email is not None:
- result = result + " <" + email + ">"
-
- self._sender = result
- return result
-
- @property
- def to(self):
- try:
- return self._to
- except Exception:
- # Check header first
- if self.header is not None:
- headerResult = self.header["to"]
- if headerResult is not None:
- self._to = headerResult
- return headerResult
-
- # Extract from other fields
- # TODO: This should really extract data from the recip folders,
- # but how do you know which is to/cc/bcc?
- display = self._getStringStream('__substg1.0_0E04')
- self._to = display
- return display
-
- @property
- def cc(self):
- try:
- return self._cc
- except Exception:
- # Check header first
- if self.header is not None:
- headerResult = self.header["cc"]
- if headerResult is not None:
- self._cc = headerResult
- return headerResult
-
- # Extract from other fields
- # TODO: This should really extract data from the recip folders,
- # but how do you know which is to/cc/bcc?
- display = self._getStringStream('__substg1.0_0E03')
- self._cc = display
- return display
-
- @property
- def body(self):
- return self._getStringStream('__substg1.0_1000')
-
- @property
- def sujet(self):
- return self._getStringStream('__substg1.0_0037')
-
- @property
- def recupar(self):
- return self._getStringStream('__substg1.0_0040')
-
- @property
- def nomaffichefrom(self):
- return self._getStringStream('__substg1.0_0042')
-
- @property
- def Recupar(self):
- return self._getStringStream('__substg1.0_0044')
-
- @property
- def Lesender(self):
- return self._getStringStream('__substg1.0_0065')
-
- @property
- def lobjet(self):
- return self._getStringStream('__substg1.0_0070')
-
- @property
- def lentete(self):
- return self._getStringStream('__substg1.0_007d')
-
- @property
- def bcc(self):
- return self._getStringStream('__substg1.0_0E02')
-
- @property
- def displayto(self):
- return self._getStringStream('__substg1.0_0E04')
-
- def dump(self):
- # Prints out a summary of the message
- print('Message')
- print('Subject:', self.subject)
- print('Date:', self.date)
- print('Body:')
- print(self.body)
- print('Recu par: ', self.recupar)
- print('Nom affiche dans le from: %s' % self.nomaffichefrom)
- print('Le sender: ', self.Lesender)
- print('lobjet: ', self.lobjet)
- print('lentete: ', self.lentete)
- print('bcc: ', self.bcc)
- print('display to: ', self.displayto)
-
- def getReport(self):
- result = {"subject": self.subject, "date": self.date, "receivers": self.recupar, "displayFrom": self.nomaffichefrom,
- "sender": self.Lesender, "topic": self.lobjet, "bcc": self.bcc, "displayTo": self.displayto,
- "headers": self.lentete, "body": self.body}
-
- attachments = []
- for attachment in self.attachments:
- attachments.append({"filename": attachment.longFilename,
- "mime": attachment.mimeTag, "extension": attachment.extension})
-
- result["attachments"] = attachments
-
- return result
diff --git a/analyzers/MsgParser/parse.py b/analyzers/MsgParser/parse.py
index d12c71343..de498a905 100755
--- a/analyzers/MsgParser/parse.py
+++ b/analyzers/MsgParser/parse.py
@@ -1,40 +1,116 @@
#!/usr/bin/env python3
# encoding: utf-8
-from lib.msgParser import Message
from cortexutils.analyzer import Analyzer
-
+from outlook_msg import Message
+import iocextract
+import extract_msg
+import tempfile
+import hashlib
class MsgParserAnalyzer(Analyzer):
-
+
def __init__(self):
Analyzer.__init__(self)
-
- self.filename = self.get_param('filename', 'noname.ext')
- self.filepath = self.get_param('file', None, 'File is missing')
+ # Value of the job's 'file' parameter, used later as the path of the MSG file to parse.
+ # NOTE(review): 'File is missing' is presumably the error reported when the parameter is absent - confirm against cortexutils.
+ self.filepath = self.get_param('file', None, 'File is missing')
def summary(self, raw):
taxonomies = []
- level = "info"
- namespace = "MsgParser"
- predicate = "Attachments"
- value = "0"
- if "attachments" in raw:
- value = len(raw["attachments"])
- taxonomies.append(self.build_taxonomy(level, namespace, predicate, value))
+        # Without an attachment list there is nothing to summarise.
+        if 'attachments' not in raw:
+            return {'taxonomies': taxonomies}
+
+        # Expose the number of attachments as an informational taxonomy.
+        attachment_count = len(raw['attachments'])
+        taxonomies.append(self.build_taxonomy('info', 'MsgParser', 'Attachments', attachment_count))
+        return {'taxonomies': taxonomies}
+
+    # @brief Build the observables (artifacts) sent back to TheHive
+    # @param raw: report produced by parseMsg()
+    # @return list of artifacts: one 'file' artifact (TLP 3) per saved
+    #         attachment, then the 'url', 'ip', 'mail' and 'hash' values
+    #         that iocextract finds in the textual form of the report
+    def artifacts(self, raw):
+        artifacts = []
+        extractors = (
+            ('url', iocextract.extract_urls),
+            ('ip', iocextract.extract_ipv4s),
+            ('mail', iocextract.extract_emails),
+            ('hash', iocextract.extract_hashes),
+        )
+
+        try:
+            # Extract each attachment to send as an observable.
+            # getattr() keeps this method usable even if parseMsg() never ran.
+            for attachment in getattr(self, 'attachments_paths', []):
+                artifacts.append(self.build_artifact('file', attachment, tlp=3))
+
+            # Stringify the report once instead of once per extractor.
+            report_text = str(raw)
+            for data_type, extract in extractors:
+                # sorted(set(...)) removes duplicates and keeps the output
+                # order stable from one run to the next.
+                for value in sorted(set(extract(report_text))):
+                    artifacts.append(self.build_artifact(data_type, str(value)))
+        finally:
+            # Cleanup the temporary folder, even when building an artifact fails
+            temp_dir = getattr(self, 'temp_dir', None)
+            if temp_dir is not None:
+                temp_dir.cleanup()
+
+        return artifacts
+
+
+    # @brief Compute the hexadecimal digest of a bytes buffer
+    # @param data_bytes: bytes to hash (e.g. the content read from a file)
+    # @param mode: name of the hashlib algorithm to use
+    # @return the digest as a hexadecimal string
+    def get_hash(self, data_bytes, mode='md5'):
+        return hashlib.new(mode, data_bytes).hexdigest()
- return {"taxonomies": taxonomies}
+    # @brief Return a bare file name that is safe to create in the temporary folder
+    # @param filename: attachment name as found in the (untrusted) message
+    # @param index: position of the attachment, used to build a fallback name
+    @staticmethod
+    def _safe_attachment_name(filename, index):
+        # Keep only the last path component so that '../' sequences or
+        # absolute paths cannot make the analyzer write outside its folder.
+        name = str(filename or '').replace('\\', '/').rsplit('/', 1)[-1]
+        name = name.replace('\x00', '').strip()
+        if name in ('', '.', '..'):
+            name = 'attachment_{}'.format(index)
+        return name
+
+    # @brief Main function to retrieve mail information and attachments
+    # @return dict with the keys subject, date, receivers, sender, bcc,
+    #         headers, body, MessageID, XoriginatingIP and 'attachments'
+    #         (list of dicts holding filename, md5, sha1 and sha256).
+    #         Fields missing from the message are None.
+    # Side effects: creates self.temp_dir, writes every attachment into it
+    # and records the written paths in self.attachments_paths.
+    def parseMsg(self):
+
+        def as_text(value):
+            # Keep missing fields as None (JSON null) rather than the literal
+            # string 'None', so the report template can fall back to '-'.
+            return None if value is None else str(value)
+
+        # Extract all information from the mail with extract_msg
+        details = extract_msg.Message(self.filepath)
+        try:
+            header = details.header
+            originating_ip = header.get('x-originating-ip') if header is not None else None
+            result = {
+                'subject': as_text(details.subject),
+                'date': as_text(details.date),
+                'receivers': as_text(details.to),
+                'sender': as_text(details.sender),
+                'bcc': as_text(details.bcc),
+                'headers': as_text(header),
+                'body': as_text(details.body),
+                'MessageID': as_text(details.messageId),
+                'XoriginatingIP': as_text(originating_ip),
+                'attachments': [],
+            }
+        finally:
+            # Release the file handle held by extract_msg
+            details.close()
+
+        # Retrieves the list of attachments and saves them in a temporary folder.
+        # Then for each attachment, calculates the different hashes of the attachment
+        self.attachments_paths = []
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+        # An MSG file is a binary container: open it in binary mode
+        with open(self.filepath, 'rb') as msg_file:
+            container = Message(msg_file)
+
+        for index, an_attachment in enumerate(container.attachments):
+            name = self._safe_attachment_name(an_attachment.filename, index)
+            attachment_path = '{}/{}'.format(self.temp_dir.name, name)
+            # Two attachments may share a name: never overwrite an earlier one
+            while attachment_path in self.attachments_paths:
+                name = '{}_{}'.format(index, name)
+                attachment_path = '{}/{}'.format(self.temp_dir.name, name)
+
+            with an_attachment.open() as attachment_fp:
+                data = attachment_fp.read()
+            with open(attachment_path, 'wb') as output_fp:
+                output_fp.write(data)
+
+            # Only record the path once the file really exists on disk
+            self.attachments_paths.append(attachment_path)
+            result['attachments'].append({
+                'filename': name,
+                # Calculates the hash of each attachment
+                'md5': self.get_hash(data, 'md5'),
+                'sha1': self.get_hash(data, 'sha1'),
+                'sha256': self.get_hash(data, 'sha256'),
+            })
+
+        return result
def run(self):
- if self.data_type == 'file':
- try:
- self.report(Message(self.filepath).getReport())
- except Exception as e:
- self.unexpectedError(e)
+        # Parse the submitted MSG file and report the result. Any failure
+        # (corrupt or non-MSG file, unreadable attachment, ...) is reported
+        # through unexpectedError() instead of crashing with a raw traceback.
+        if self.data_type == 'file':
+            try:
+                self.report(self.parseMsg())
+            except Exception as e:
+                self.unexpectedError(e)
else:
self.notSupported()
-
-
+
if __name__ == '__main__':
MsgParserAnalyzer().run()
diff --git a/analyzers/MsgParser/requirements.txt b/analyzers/MsgParser/requirements.txt
index 1a17a0ad5..227395cd9 100644
--- a/analyzers/MsgParser/requirements.txt
+++ b/analyzers/MsgParser/requirements.txt
@@ -1,2 +1,5 @@
cortexutils
olefile
+extract-msg
+iocextract
+outlook-msg
\ No newline at end of file
diff --git a/thehive-templates/Msg_Parser_3_0/short.html b/thehive-templates/Msg_Parser_3_0/short.html
deleted file mode 100644
index 5fc0dabfb..000000000
--- a/thehive-templates/Msg_Parser_3_0/short.html
+++ /dev/null
@@ -1,3 +0,0 @@
-
- {{t.namespace}}:{{t.predicate}}="{{t.value}}"
-
diff --git a/thehive-templates/Msg_Parser_3_0/long.html b/thehive-templates/Msg_Parser_4_0/long.html
similarity index 62%
rename from thehive-templates/Msg_Parser_3_0/long.html
rename to thehive-templates/Msg_Parser_4_0/long.html
index 1291294fc..6a1a62520 100644
--- a/thehive-templates/Msg_Parser_3_0/long.html
+++ b/thehive-templates/Msg_Parser_4_0/long.html
@@ -7,7 +7,6 @@
-
Email message details
@@ -18,44 +17,58 @@
- From
- - {{content.displayFrom}} ({{content.sender}})
+ - {{content.sender}}
- To
- - {{content.displayTo}} ({{content.receivers}})
+ - {{content.receivers}}
- Subject
- {{content.subject || '-'}}
- - Topic
- - {{content.topic || '-'}}
+ - Date
+ - {{content.date || '-'}}
+
+
+ - X-Originating-IP
+ - {{content.XoriginatingIP || '-'}}
+
+
+ - Message-ID
+ - {{content.MessageID || '-'}}
- Bcc
- {{content.bcc || '-'}}
+
- Attachments
-
This message file includes
-
+
-
- | Filename |
- Mime Type |
- Extension |
-
+
+ | Filename |
+ File information |
+
+
-
-
- | {{a.filename}} |
- {{a.mime}} |
- {{a.extension}} |
-
+
+
+ | {{a.filename}} |
+ [MD5]: {{a.md5}} |
+
+
+ | [SHA1]: {{a.sha1}} |
+
+
+ | [SHA256]: {{a.sha256}} |
+
@@ -74,3 +87,4 @@
+
diff --git a/thehive-templates/Msg_Parser_4_0/short.html b/thehive-templates/Msg_Parser_4_0/short.html
new file mode 100644
index 000000000..41a60f314
--- /dev/null
+++ b/thehive-templates/Msg_Parser_4_0/short.html
@@ -0,0 +1,4 @@
+
+ {{t.namespace}}:{{t.predicate}}={{t.value}}