Skip to content

Commit b467a46

Browse files
committed
Python2 to Python3
1 parent 4150322 commit b467a46

4 files changed

Lines changed: 145 additions & 79 deletions

File tree

check.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,43 @@
11
import os
2-
import re
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.feature_extraction.text import TfidfTransformer
75

86
import pickle
7+
import chardet
8+
import json
9+
10+
import warnings
11+
warnings.filterwarnings('ignore')
912

1013
max_features = 50000
1114

1215
check_dir = "Data/check/"
1316

17+
18+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    The entire file is read in binary mode and handed to chardet; the
    detected encoding name is returned. NOTE(review): chardet may return
    None for the 'encoding' key when it cannot decide — callers must be
    prepared for that.
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
23+
24+
1425
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2239

40+
2341
def check_webshell(dir):
2442
all = 0
2543
all_php = 0
@@ -55,28 +73,28 @@ def check_webshell(dir):
5573
g = os.walk(dir)
5674
for path, d, filelist in g:
5775
for filename in filelist:
58-
fulpath=os.path.join(path, filename)
76+
fulpath = os.path.join(path, filename)
5977
all += 1
60-
if filename.endswith('.php'):
78+
if filename.endswith('.php') or filename.endswith('.txt'):
6179
all_php += 1
6280
t = load_str(fulpath)
63-
t_list=[]
81+
t_list = []
6482
t_list.append(t)
6583
x = CV1.transform(t_list).toarray()
6684
x = transformer.fit_transform(x).toarray()
6785
y_pred = clf1.predict(x)
68-
elif filename.endswith('.asp'):
86+
elif filename.endswith('.asp') or filename.endswith('.txt'):
6987
all_asp += 1
7088
t = load_str(fulpath)
71-
t_list=[]
89+
t_list = []
7290
t_list.append(t)
7391
x = CV2.transform(t_list).toarray()
7492
x = transformer.fit_transform(x).toarray()
7593
y_pred = clf2.predict(x)
76-
elif filename.endswith('.jsp'):
94+
elif filename.endswith('.jsp') or filename.endswith('.txt'):
7795
all_jsp += 1
7896
t = load_str(fulpath)
79-
t_list=[]
97+
t_list = []
8098
t_list.append(t)
8199
x = CV3.transform(t_list).toarray()
82100
x = transformer.fit_transform(x).toarray()
@@ -85,10 +103,11 @@ def check_webshell(dir):
85103
other += 1
86104

87105
if y_pred[0] == 1:
88-
print "%s may be a webshell file" % fulpath
89106
webshell += 1
90107

91-
print "Scan %d files(%d php files, %d asp files, %d jsp files, %d other files),%d files is webshell" % (all, all_php, all_asp, all_jsp, other, webshell)
108+
print (json.dumps({'filename': filename, 'result': int(y_pred[0])}, sort_keys=True, indent=4, separators=(',', ': ')))
109+
110+
print ("Scan %d files(%d php files, %d asp files, %d jsp files, %d other files),%d files is webshell" % (all, all_php, all_asp, all_jsp, other, webshell))
92111

93112
if __name__ == '__main__':
94113

train_asp.py

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import re
21
import os
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.model_selection import train_test_split
@@ -9,6 +7,7 @@
97
from sklearn.feature_extraction.text import TfidfTransformer
108

119
import pickle
10+
import chardet
1211

1312
max_features = 50000
1413

@@ -18,34 +17,48 @@
1817
white_count = 0
1918
black_count = 0
2019

20+
21+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    Reads the whole file as raw bytes and asks chardet for its best
    guess; returns the detected encoding name (may be None when chardet
    cannot decide).
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
26+
27+
2128
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2942

43+
3044
def load_files(dir):
    """Recursively walk *dir* and load every .asp file as a string.

    Returns a list with one entry per .asp file, in os.walk order.
    NOTE(review): the parameter name shadows the builtin dir(); kept
    unchanged for caller compatibility.
    """
    files_list = []
    for path, _dirs, filelist in os.walk(dir):
        for filename in filelist:
            # Skip anything that is not an ASP source file.
            if not filename.endswith('.asp'):
                continue
            fulpath = os.path.join(path, filename)
            print("Load %s" % fulpath)
            files_list.append(load_str(fulpath))
    return files_list
4155

56+
4257
def get_feature_by_wordbag_tfidf():
4358
global max_features
4459
global white_count
4560
global black_count
46-
print "max_features = %d" % max_features
47-
x = []
48-
y = []
61+
print ("max_features = %d" % max_features)
4962

5063
webshell_files_list = load_files(webshell_dir)
5164
y1 = [1] * len(webshell_files_list)
@@ -71,15 +84,17 @@ def get_feature_by_wordbag_tfidf():
7184

7285
return x, y
7386

87+
7488
def do_metrics(y_test, y_pred):
    """Print the standard classification metrics for *y_pred* vs *y_test*.

    Emits accuracy, confusion matrix, precision and recall, each preceded
    by its label, in that order.
    """
    report = (
        ("metrics.accuracy_score:", metrics.accuracy_score),
        ("metrics.confusion_matrix:", metrics.confusion_matrix),
        ("metrics.precision_score:", metrics.precision_score),
        ("metrics.recall_score:", metrics.recall_score),
    )
    for label, scorer in report:
        print(label)
        print(scorer(y_test, y_pred))
97+
8398

8499
def do_GNB(x, y):
85100
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
@@ -92,10 +107,11 @@ def do_GNB(x, y):
92107

93108
do_metrics(y_test, y_pred)
94109

110+
95111
if __name__ == '__main__':
96112

97113
x, y = get_feature_by_wordbag_tfidf()
98-
print "Load %d white files %d black files" % (white_count, black_count)
114+
print ("Load %d white files %d black files" % (white_count, black_count))
99115

100116
do_GNB(x, y)
101117

train_jsp.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import re
21
import os
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.model_selection import train_test_split
@@ -9,6 +7,7 @@
97
from sklearn.feature_extraction.text import TfidfTransformer
108

119
import pickle
10+
import chardet
1211

1312
max_features = 50000
1413

@@ -18,34 +17,47 @@
1817
white_count = 0
1918
black_count = 0
2019

20+
21+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    Reads the whole file as raw bytes and asks chardet for its best
    guess; returns the detected encoding name (may be None when chardet
    cannot decide).
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
26+
27+
2128
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2942

3043
def load_files(dir):
    """Recursively walk *dir* and load every .jsp file as a string.

    Returns a list with one entry per .jsp file, in os.walk order.
    NOTE(review): the parameter name shadows the builtin dir(); kept
    unchanged for caller compatibility.
    """
    files_list = []
    for path, _dirs, filelist in os.walk(dir):
        for filename in filelist:
            # Skip anything that is not a JSP source file.
            if not filename.endswith('.jsp'):
                continue
            fulpath = os.path.join(path, filename)
            print("Load %s" % fulpath)
            files_list.append(load_str(fulpath))
    return files_list
4154

55+
4256
def get_feature_by_wordbag_tfidf():
4357
global max_features
4458
global white_count
4559
global black_count
46-
print "max_features = %d" % max_features
47-
x = []
48-
y = []
60+
print ("max_features = %d" % max_features)
4961

5062
webshell_files_list = load_files(webshell_dir)
5163
y1 = [1] * len(webshell_files_list)
@@ -71,15 +83,17 @@ def get_feature_by_wordbag_tfidf():
7183

7284
return x, y
7385

86+
7487
def do_metrics(y_test, y_pred):
    """Print the standard classification metrics for *y_pred* vs *y_test*.

    Emits accuracy, confusion matrix, precision and recall, each preceded
    by its label, in that order.
    """
    report = (
        ("metrics.accuracy_score:", metrics.accuracy_score),
        ("metrics.confusion_matrix:", metrics.confusion_matrix),
        ("metrics.precision_score:", metrics.precision_score),
        ("metrics.recall_score:", metrics.recall_score),
    )
    for label, scorer in report:
        print(label)
        print(scorer(y_test, y_pred))
96+
8397

8498
def do_GNB(x, y):
8599
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
@@ -92,10 +106,11 @@ def do_GNB(x, y):
92106

93107
do_metrics(y_test, y_pred)
94108

109+
95110
if __name__ == '__main__':
96111

97112
x, y = get_feature_by_wordbag_tfidf()
98-
print "Load %d white files %d black files" % (white_count, black_count)
113+
print ("Load %d white files %d black files" % (white_count, black_count))
99114

100115
do_GNB(x, y)
101116

0 commit comments

Comments
 (0)