-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathworkshop.py
More file actions
101 lines (81 loc) · 3.81 KB
/
workshop.py
File metadata and controls
101 lines (81 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from operator import itemgetter
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
def boxplot_sorted(df: pd.DataFrame, by, column, vert=False, patch_artist=True, **kwds):
    """Draw one box per group of ``by``, ordered by ascending median of ``column``.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot.
    by
        Grouping key handed to ``DataFrame.groupby``.
    column
        Label of the column whose values are plotted for every group.
    vert, patch_artist, **kwds
        Forwarded unchanged to ``DataFrame.boxplot``.
    """
    # One Series per group; the DataFrame constructor lines them up on the
    # union of their indexes, so every group becomes its own column.
    per_group = {}
    for group_key, group_frame in df.groupby(by):
        per_group[group_key] = group_frame[column]
    wide = pd.DataFrame(per_group)

    # Column order follows the group medians, smallest first.
    ordered_groups = wide.median().sort_values().index
    wide[ordered_groups].boxplot(vert=vert, patch_artist=patch_artist, **kwds)
def show_most_informative_features(model, vectorizer=None, classifier=None, n=20):
    """Print the n highest and n lowest coefficients with their feature names.

    Parameters
    ----------
    model
        Object exposing ``steps`` (a list of ``(name, estimator)`` pairs).
        Only consulted when ``vectorizer`` or ``classifier`` is not given:
        the first step is used as the vectorizer, the last as the classifier.
    vectorizer : optional
        Object handed to ``get_feature_names`` to obtain the feature labels.
    classifier : optional
        Estimator exposing a ``coef_`` attribute.
    n : int, default 20
        Number of entries printed in each of the two lists.

    Raises
    ------
    TypeError
        If the classifier has no ``coef_`` attribute.
    """
    # Source: https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

    # Compare against None explicitly: estimator containers may define
    # __len__, so a truthiness test could discard an explicitly passed object.
    if vectorizer is None:
        vectorizer = model.steps[0][1]
    if classifier is None:
        classifier = model.steps[-1][1]

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            f"Cannot compute most informative features on {classifier.__class__.__name__}."
        )

    # Only the first row of coef_ is inspected.
    weights = classifier.coef_[0]

    # Pair each coefficient with its feature name, largest coefficient first.
    # NOTE: zip stops at the shorter input, so a mismatch between the number
    # of names and the number of coefficients is silently truncated.
    ranked = sorted(
        zip(weights, get_feature_names(vectorizer)),
        key=itemgetter(0),
        reverse=True,
    )

    print("Most Positive")
    for score, label in ranked[:n]:
        print(f"{score:.2f}\t{label}")

    # Walk backwards from the end of the list: lowest coefficient first.
    print("\nLeast Positive")
    for score, label in ranked[:-(n + 1):-1]:
        print(f"{score:.2f}\t{label}")
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Parameters
    ----------
    column_transformer
        Either a ``Pipeline`` or an object whose ``_iter(fitted=True)``
        yields 4-tuples starting with ``(name, transformer, column)``.
        Nested pipelines are handled recursively.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Source: https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html

    def prefixed_columns(name, column):
        # Fallback: reuse the input column labels, prefixed with the step name.
        if column is None:
            # Steps that come from a Pipeline carry no column selection.
            return []
        if isinstance(column, str):
            # A scalar selector names one column; iterating it directly
            # would yield its individual characters.
            column = [column]
        return [f"{name}__{f}" for f in column]

    def get_names(name, trans, column):
        # name/column are passed in explicitly rather than read from the
        # enclosing loop, so the helper does not depend on loop state.
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                return column_transformer._df_columns[column]
            indices = np.arange(column_transformer._n_features)
            return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: Return input column names if no method available
            # NOTE(review): newer scikit-learn releases appear to expose
            # `get_feature_names_out` instead of `get_feature_names`; such
            # transformers end up in this branch - confirm against the
            # installed version.
            return prefixed_columns(name, column)
        return [f"{name}__{f}" for f in trans.get_feature_names()]

    ### Start of processing
    # Allow transformers to be pipelines. Pipeline steps are named
    # differently, so they are reshaped into the 4-tuple layout used below.
    if isinstance(column_transformer, Pipeline):
        l_transformers = [
            (name, trans, None, None)
            for _, name, trans in column_transformer._iter()
        ]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    feature_names = []
    for name, trans, column, _ in l_transformers:
        if isinstance(trans, Pipeline):
            # Recursive call on pipeline
            names = get_feature_names(trans)
            # If the pipeline has no transformer that returns names, fall
            # back to the input columns (empty when there are none).
            if not names:
                names = prefixed_columns(name, column)
            feature_names.extend(names)
        else:
            feature_names.extend(get_names(name, trans, column))

    return feature_names