Skip to content

Commit b467a46

Browse files
committed
Python2 to Python3
1 parent 4150322 commit b467a46

4 files changed

Lines changed: 145 additions & 79 deletions

File tree

check.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,43 @@
11
import os
2-
import re
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.feature_extraction.text import TfidfTransformer
75

86
import pickle
7+
import chardet
8+
import json
9+
10+
import warnings
11+
warnings.filterwarnings('ignore')
912

1013
max_features = 50000
1114

1215
check_dir = "Data/check/"
1316

17+
18+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    The entire file is read in binary mode and handed to chardet; the
    detected encoding name is returned. NOTE(review): chardet may return
    None for the 'encoding' key when it cannot decide — callers must be
    prepared for that.
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
23+
24+
1425
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2239

40+
2341
def check_webshell(dir):
2442
all = 0
2543
all_php = 0
@@ -55,28 +73,28 @@ def check_webshell(dir):
5573
g = os.walk(dir)
5674
for path, d, filelist in g:
5775
for filename in filelist:
58-
fulpath=os.path.join(path, filename)
76+
fulpath = os.path.join(path, filename)
5977
all += 1
60-
if filename.endswith('.php'):
78+
if filename.endswith('.php') or filename.endswith('.txt'):
6179
all_php += 1
6280
t = load_str(fulpath)
63-
t_list=[]
81+
t_list = []
6482
t_list.append(t)
6583
x = CV1.transform(t_list).toarray()
6684
x = transformer.fit_transform(x).toarray()
6785
y_pred = clf1.predict(x)
68-
elif filename.endswith('.asp'):
86+
elif filename.endswith('.asp') or filename.endswith('.txt'):
6987
all_asp += 1
7088
t = load_str(fulpath)
71-
t_list=[]
89+
t_list = []
7290
t_list.append(t)
7391
x = CV2.transform(t_list).toarray()
7492
x = transformer.fit_transform(x).toarray()
7593
y_pred = clf2.predict(x)
76-
elif filename.endswith('.jsp'):
94+
elif filename.endswith('.jsp') or filename.endswith('.txt'):
7795
all_jsp += 1
7896
t = load_str(fulpath)
79-
t_list=[]
97+
t_list = []
8098
t_list.append(t)
8199
x = CV3.transform(t_list).toarray()
82100
x = transformer.fit_transform(x).toarray()
@@ -85,10 +103,11 @@ def check_webshell(dir):
85103
other += 1
86104

87105
if y_pred[0] == 1:
88-
print "%s may be a webshell file" % fulpath
89106
webshell += 1
90107

91-
print "Scan %d files(%d php files, %d asp files, %d jsp files, %d other files),%d files is webshell" % (all, all_php, all_asp, all_jsp, other, webshell)
108+
print (json.dumps({'filename': filename, 'result': int(y_pred[0])}, sort_keys=True, indent=4, separators=(',', ': ')))
109+
110+
print ("Scan %d files(%d php files, %d asp files, %d jsp files, %d other files),%d files is webshell" % (all, all_php, all_asp, all_jsp, other, webshell))
92111

93112
if __name__ == '__main__':
94113

train_asp.py

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import re
21
import os
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.model_selection import train_test_split
@@ -9,6 +7,7 @@
97
from sklearn.feature_extraction.text import TfidfTransformer
108

119
import pickle
10+
import chardet
1211

1312
max_features = 50000
1413

@@ -18,34 +17,48 @@
1817
white_count = 0
1918
black_count = 0
2019

20+
21+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    Reads the whole file as raw bytes and asks chardet for its best
    guess; returns the detected encoding name (may be None when chardet
    cannot decide).
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
26+
27+
2128
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2942

43+
3044
def load_files(dir):
    """Recursively walk *dir* and load every .asp file as a string.

    Returns a list with one entry per .asp file, in os.walk order.
    NOTE(review): the parameter name shadows the builtin dir(); kept
    unchanged for caller compatibility.
    """
    files_list = []
    for path, _dirs, filelist in os.walk(dir):
        for filename in filelist:
            # Skip anything that is not an ASP source file.
            if not filename.endswith('.asp'):
                continue
            fulpath = os.path.join(path, filename)
            print("Load %s" % fulpath)
            files_list.append(load_str(fulpath))
    return files_list
4155

56+
4257
def get_feature_by_wordbag_tfidf():
4358
global max_features
4459
global white_count
4560
global black_count
46-
print "max_features = %d" % max_features
47-
x = []
48-
y = []
61+
print ("max_features = %d" % max_features)
4962

5063
webshell_files_list = load_files(webshell_dir)
5164
y1 = [1] * len(webshell_files_list)
@@ -71,15 +84,17 @@ def get_feature_by_wordbag_tfidf():
7184

7285
return x, y
7386

87+
7488
def do_metrics(y_test, y_pred):
    """Print the standard classification metrics for *y_pred* vs *y_test*.

    Emits accuracy, confusion matrix, precision and recall, each preceded
    by its label, in that order.
    """
    report = (
        ("metrics.accuracy_score:", metrics.accuracy_score),
        ("metrics.confusion_matrix:", metrics.confusion_matrix),
        ("metrics.precision_score:", metrics.precision_score),
        ("metrics.recall_score:", metrics.recall_score),
    )
    for label, scorer in report:
        print(label)
        print(scorer(y_test, y_pred))
97+
8398

8499
def do_GNB(x, y):
85100
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
@@ -92,10 +107,11 @@ def do_GNB(x, y):
92107

93108
do_metrics(y_test, y_pred)
94109

110+
95111
if __name__ == '__main__':
96112

97113
x, y = get_feature_by_wordbag_tfidf()
98-
print "Load %d white files %d black files" % (white_count, black_count)
114+
print ("Load %d white files %d black files" % (white_count, black_count))
99115

100116
do_GNB(x, y)
101117

train_jsp.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import re
21
import os
3-
import numpy as np
42

53
from sklearn.feature_extraction.text import CountVectorizer
64
from sklearn.model_selection import train_test_split
@@ -9,6 +7,7 @@
97
from sklearn.feature_extraction.text import TfidfTransformer
108

119
import pickle
10+
import chardet
1211

1312
max_features = 50000
1413

@@ -18,34 +17,47 @@
1817
white_count = 0
1918
black_count = 0
2019

20+
21+
def check_style(filepath):
    """Guess the character encoding of *filepath*.

    Reads the whole file as raw bytes and asks chardet for its best
    guess; returns the detected encoding name (may be None when chardet
    cannot decide).
    """
    with open(filepath, mode='rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
26+
27+
2128
def load_str(filepath):
    """Read *filepath* and return its contents as one string, with line
    terminators removed.

    The encoding is guessed via check_style() (chardet). Fixes over the
    original: (1) an unrecognised encoding name from chardet raised an
    uncaught LookupError; (2) the fallback branch returned raw bytes while
    the happy path returned str, so downstream text vectorizers saw mixed
    types. The fallback now decodes with latin-1, which maps every byte
    and therefore never fails, keeping the return type str in all cases.
    """
    parts = []
    try:
        style = check_style(filepath)
        # encoding=None (chardet gave up) means the locale default is used,
        # which may raise UnicodeDecodeError; an unknown encoding name
        # raises LookupError — both are handled below.
        with open(filepath, encoding=style) as f:
            for line in f:
                # Drop CR/LF so the whole file collapses into one string.
                line = line.strip('\r')
                line = line.strip('\n')
                parts.append(line)
        t = "".join(parts)
    except (UnicodeDecodeError, LookupError):
        with open(filepath, mode='rb') as f:
            # latin-1 decodes any byte sequence, preserving a str return.
            t = f.read().decode('latin-1')
    return t
2942

3043
def load_files(dir):
    """Recursively walk *dir* and load every .jsp file as a string.

    Returns a list with one entry per .jsp file, in os.walk order.
    NOTE(review): the parameter name shadows the builtin dir(); kept
    unchanged for caller compatibility.
    """
    files_list = []
    for path, _dirs, filelist in os.walk(dir):
        for filename in filelist:
            # Skip anything that is not a JSP source file.
            if not filename.endswith('.jsp'):
                continue
            fulpath = os.path.join(path, filename)
            print("Load %s" % fulpath)
            files_list.append(load_str(fulpath))
    return files_list
4154

55+
4256
def get_feature_by_wordbag_tfidf():
4357
global max_features
4458
global white_count
4559
global black_count
46-
print "max_features = %d" % max_features
47-
x = []
48-
y = []
60+
print ("max_features = %d" % max_features)
4961

5062
webshell_files_list = load_files(webshell_dir)
5163
y1 = [1] * len(webshell_files_list)
@@ -71,15 +83,17 @@ def get_feature_by_wordbag_tfidf():
7183

7284
return x, y
7385

86+
7487
def do_metrics(y_test, y_pred):
    """Print the standard classification metrics for *y_pred* vs *y_test*.

    Emits accuracy, confusion matrix, precision and recall, each preceded
    by its label, in that order.
    """
    report = (
        ("metrics.accuracy_score:", metrics.accuracy_score),
        ("metrics.confusion_matrix:", metrics.confusion_matrix),
        ("metrics.precision_score:", metrics.precision_score),
        ("metrics.recall_score:", metrics.recall_score),
    )
    for label, scorer in report:
        print(label)
        print(scorer(y_test, y_pred))
96+
8397

8498
def do_GNB(x, y):
8599
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
@@ -92,10 +106,11 @@ def do_GNB(x, y):
92106

93107
do_metrics(y_test, y_pred)
94108

109+
95110
if __name__ == '__main__':
96111

97112
x, y = get_feature_by_wordbag_tfidf()
98-
print "Load %d white files %d black files" % (white_count, black_count)
113+
print ("Load %d white files %d black files" % (white_count, black_count))
99114

100115
do_GNB(x, y)
101116

0 commit comments

Comments
 (0)