我最近了解了自定义转换器,并正在尝试将其实现为NLP管道的类别编码器。
转换器接收特征列,并按照下表所示的方式对其进行响应编码(response coding)。
请帮助我以更符合Python习惯(Pythonic)的方式编写这段代码。我认为不应该在 transform 方法中使用 y,该如何解决这个问题?
#Custom Transformer that returns passed column as response coding and deletes the original column
from sklearn.base import BaseEstimator,TransformerMixin
class response_coding_transformer(BaseEstimator, TransformerMixin):
    """Response-code categorical columns and drop the original columns.

    For every configured column, each category value is replaced by one
    column per target class holding the (optionally smoothed) fraction of
    training rows with that category that belong to the class.  The table
    is learned in ``fit`` from the ``y`` the pipeline supplies and is only
    looked up in ``transform``, so no target is needed at transform time.

    Parameters
    ----------
    feature : str or list of str
        Name(s) of the DataFrame column(s) to encode.
    y : array-like, optional
        Legacy fallback target, used only when ``fit`` is called without
        ``y``.  Inside a Pipeline / ColumnTransformer leave it as ``None``.
    alpha : float, default 0
        Smoothing strength.  ``10 * alpha`` pseudo-counts are added to every
        (category, class) cell; ``0`` gives the raw class fractions.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique target values seen in ``fit``; output column
        ``<feature>_<i>`` corresponds to ``classes_[i]``.
    response_tables_ : dict
        ``{feature: {category: ndarray of per-class probabilities}}``.
    """
    # Encoding idea adapted from:
    # https://github.com/Lalit-Yadav-E1483/ml_case_studies/blob/7da02bdef5512c6ab636727a2a589072b0e2d515/cancer_diagnosis/initial_analysis.ipynb

    def __init__(self, feature, y=None, alpha=0):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # from attributes of the same name, so store them unmodified.
        self.feature = feature
        self.y = y
        self.alpha = alpha

    def _feature_list(self):
        """Return the configured column names as a list."""
        if isinstance(self.feature, str):
            return [self.feature]
        return list(self.feature)

    @staticmethod
    def _as_label_array(y):
        """Flatten a target given as array, Series or one-column DataFrame."""
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        # Work positionally: X usually carries a shuffled index after a
        # train/test split while y is often a plain ndarray.
        return np.ravel(np.asarray(y))

    def _response_table(self, column, labels):
        """Map each category in *column* to its per-class probability vector."""
        counts = pd.crosstab(np.asarray(column), labels)
        # Guarantee one column per known class, in classes_ order.
        counts = counts.reindex(columns=self.classes_, fill_value=0)
        pseudo = 10 * self.alpha
        # Adding the pseudo-count once per class keeps every row summing to 1
        # (the original hard-coded 90*alpha, i.e. exactly 9 classes).
        denominator = counts.sum(axis=1) + pseudo * len(self.classes_)
        probabilities = counts.add(pseudo).div(denominator, axis=0)
        return dict(zip(probabilities.index, probabilities.to_numpy()))

    @staticmethod
    def rc_df(response_array, feature, index=None):
        """Wrap an (n_rows, n_classes) array as columns ``<feature>_<i>``."""
        names = ["{}_{}".format(feature, i)
                 for i in range(response_array.shape[1])]
        return pd.DataFrame(response_array, columns=names, index=index)

    def fit(self, X, y=None):
        """Learn the category -> class-probability tables.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data containing the configured columns.
        y : array-like, optional
            Target aligned row-by-row with ``X``.  Falls back to the ``y``
            given to the constructor when omitted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no target is available or its length differs from ``X``.
        """
        target = self.y if y is None else y
        if target is None:
            raise ValueError(
                "response_coding_transformer needs a target: call fit(X, y)."
            )
        labels = self._as_label_array(target)
        if len(labels) != len(X):
            raise ValueError(
                "X and y have inconsistent lengths: %d != %d"
                % (len(X), len(labels))
            )
        self.classes_ = np.unique(labels)
        self.response_tables_ = {
            feature: self._response_table(X[feature], labels)
            for feature in self._feature_list()
        }
        return self

    def transform(self, X):
        """Replace the configured columns by their response-coded columns.

        Categories that were not seen in ``fit`` (including missing values)
        get the uniform vector ``1 / n_classes``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the configured columns.

        Returns
        -------
        pandas.DataFrame
            ``X`` without the configured columns, followed by the encoded
            columns, on the same index as ``X``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "response_tables_")
        features = self._feature_list()
        n_classes = len(self.classes_)
        uniform = np.full(n_classes, 1.0 / n_classes)

        frames = [X.drop(columns=features)]
        for feature in features:
            table = self.response_tables_[feature]
            rows = [table.get(value, uniform) for value in X[feature]]
            # reshape keeps the array 2-D even when X has no rows.
            encoded = np.array(rows, dtype=float).reshape(len(rows), n_classes)
            # Reuse X's index so the concat below aligns row-for-row.
            frames.append(self.rc_df(encoded, feature, index=X.index))
        return pd.concat(frames, axis=1)
下面是构建管道的函数:
#https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#https://stackoverflow.com/a/54704747/9292995
def Donors_choose_pipe(data, model, text_transformer):
    """Build, fit and score the preprocessing + classifier pipeline.

    Numeric columns are min-max scaled, categorical columns are response
    coded, and the ``essay`` column goes through ``text_transformer``.  The
    data is split 67/33 (stratified on the target), the pipeline is fitted
    on the training part and its score on the test part is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``project_is_approved`` (the target) and the feature
        columns named below.
    model : estimator
        Final classifier appended after preprocessing.
    text_transformer : transformer
        Vectorizer applied to the ``essay`` column.

    Returns
    -------
    tuple
        ``(pipeline, X_train, y_test)``.
        NOTE(review): training features are returned next to *test* labels;
        confirm whether ``X_test`` was intended.
    """
    # Imported locally so the corrected class name is guaranteed in scope.
    from sklearn.preprocessing import MinMaxScaler

    numeric_features = ['teacher_number_of_previously_posted_projects', 'price']
    # The scikit-learn class is spelled MinMaxScaler (the original had a typo).
    numeric_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())])

    categorical_features = ['school_state', 'teacher_prefix', 'project_grade_category', 'clean_categories', 'clean_subcategories']
    categorical_transformer = Pipeline(steps=[
        ('resp', response_coding_transformer(categorical_features))])

    # Use a separate name instead of rebinding the ``text_transformer`` argument.
    essay_transformer = Pipeline(steps=[
        ('trans', text_transformer)])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            # A scalar column name hands the vectorizer a 1-D column of text.
            ('essay', essay_transformer, "essay"),
            # 'project_title' is not vectorized; with remainder='passthrough'
            # it (and any other unlisted column) reaches the model unchanged.
        ],
        n_jobs=-1, verbose=True, remainder='passthrough'
    )

    # Append classifier to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    y = data['project_is_approved'].values
    X = data.drop(['project_is_approved'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

    # y_train is forwarded by the pipeline to every step's fit().
    pipeline.fit(X_train, y_train)
    print("model score: %.3f" % pipeline.score(X_test, y_test))
    return pipeline, X_train, y_test
# Vectorizer for the essay text and the final classifier.
text_transformer = TfidfVectorizer(stop_words='english', min_df=10)
model = XGBClassifier()
# Donors_choose_pipe takes (data, model, text_transformer) and returns
# (pipeline, X_train, y_test); the original call omitted ``model`` and
# unpacked only two of the three returned values.
clf, X_train, y_test = Donors_choose_pipe(data, model, text_transformer)
以下是数据的表头(前几行),供参考。
当我把 y 直接传给转换器时,这段代码似乎可以正常运行,但我不知道在管道中该如何把 y 传给它。