KDD Cup 2015の二値分類問題、コースからの離脱予測に関する実装を整理します。
- データ前処理:NumPyとPandasライブラリを用いた特徴量の数値化処理
#!/usr/bin/env python
# coding=utf-8
import pickle
import pandas as pd
import numpy as np
# Integer codes that replace the categorical `source` and `event` labels.
source_mapping = {
    'server': 0,
    'browser': 1,
}
event_mapping = {
    "problem": 5,
    "video": 3,
    "access": 1,
    "wiki": 4,
    "discussion": 6,
    "navigate": 2,
    "page_close": 0,
}
def create_time_dictionary():
    """Build the date -> day-number lookup and persist it with pickle.

    Every calendar day from 2013-10-27 through 2014-08-01 is numbered
    consecutively, starting at 0.

    Returns:
        pandas.Series: day numbers indexed by the corresponding dates.
    """
    days = pd.date_range('2013-10-27', '2014-08-01')
    day_numbers = pd.Series(np.arange(days.size), index=days)
    # The file keeps its ".csv" name, but its content is a pickle stream.
    with open('data/time_dict.csv', 'wb') as out_file:
        pickle.dump(day_numbers, out_file)
    return day_numbers
def generate_course_dictionary():
    """Assign an integer code to every course id and pickle the mapping.

    Course ids come from the first column of ``data/date.csv`` and are
    numbered in the order ``pd.factorize`` reports them.

    Returns:
        dict: course id -> integer code.
    """
    courses = pd.read_csv('data/date.csv', usecols=[0])
    unique_courses = pd.factorize(courses["course_id"])[1]
    course_codes = {course: code for code, course in enumerate(unique_courses)}
    # Stored as a pickle stream despite the ".csv" file name.
    with open('data/course_id_mapping.csv', 'wb') as out_file:
        pickle.dump(course_codes, out_file)
    print("course_mapping作成完了")
    return course_codes
def build_object_dictionary():
    """Assign an integer code to every object id seen in either log file.

    Objects from ``data/log_train.csv`` are numbered first; objects that
    occur only in ``data/test/log_test.csv`` continue the numbering after
    them.  The mapping is pickled to ``data/object_mapping.csv``.

    Returns:
        dict: object id -> integer code.
    """
    train_log = pd.read_csv('data/log_train.csv', usecols=[4])
    train_objects = pd.factorize(train_log["object"])[1]
    codes = {obj: code for code, obj in enumerate(train_objects)}

    test_log = pd.read_csv('data/test/log_test.csv', usecols=[4])
    test_objects = pd.factorize(test_log["object"])[1]

    # Objects the training log never mentions get the next free codes.
    unseen = [obj for obj in test_objects if obj not in train_objects]
    first_free = len(train_objects)
    codes.update(zip(unseen, np.arange(first_free, first_free + len(unseen))))

    # Stored as a pickle stream despite the ".csv" file name.
    with open('data/object_mapping.csv', 'wb') as out_file:
        pickle.dump(codes, out_file)
    print("object_mapping作成完了")
    return codes
def map_time_value(x):
    """Look up the day number for a timestamp string.

    Only the first ten characters of ``x`` are used as the key into the
    module-level ``time_dictionary``.
    """
    day_key = x[:10]
    return time_dictionary[day_key]
def map_object_value(x):
    """Return the code stored for ``x`` in the module-level ``object_dictionary``."""
    object_code = object_dictionary[x]
    return object_code
def map_course_value(x):
    """Return the code stored for ``x`` in the module-level ``course_dictionary``."""
    course_code = course_dictionary[x]
    return course_code
# Build the three lookup tables used by the map_*_value converters.
# Each call below also pickles its table to a file under data/.
time_dictionary = create_time_dictionary()
course_dictionary = generate_course_dictionary()
object_dictionary = build_object_dictionary()
def process_log_train_data():
    """Encode the raw training log numerically and save it as CSV.

    Columns 1 and 4 are converted while parsing (``map_time_value`` and
    ``map_object_value``); ``source`` and ``event`` are then replaced by
    their codes from ``source_mapping`` / ``event_mapping``.  The result
    goes to ``data/log_train_processed.csv``.
    """
    print("訓練ログデータの読み込み")
    column_converters = {1: map_time_value, 4: map_object_value}
    log = pd.read_csv('data/log_train.csv', converters=column_converters)
    print(log.head())
    # __getitem__ keeps the strict lookup: an unknown label raises KeyError.
    log["source"] = log["source"].map(source_mapping.__getitem__)
    log["event"] = log["event"].map(event_mapping.__getitem__)
    print(log.head())
    print(log.tail())
    log.to_csv('data/log_train_processed.csv', index=False)
def process_course_data():
    """Attach the course date columns to every train and test enrollment.

    Reads the enrollment files (columns 0 and 2) and ``data/date.csv``,
    encodes course ids and dates with the ``map_*_value`` converters,
    joins them on ``course_id`` and writes one CSV per data set, sorted by
    ``enrollment_id``:

    * ``data/course_train_processed.csv``
    * ``data/test/course_test_processed.csv``
    """
    train_enrollments = pd.read_csv(
        'data/enrollment_train.csv', usecols=[0, 2],
        converters={2: map_course_value})
    course_dates = pd.read_csv(
        'data/date.csv',
        converters={0: map_course_value, 1: map_time_value, 2: map_time_value})

    # A left join yields exactly the enrollment rows.  The previous outer
    # join could append rows without an enrollment_id for courses that occur
    # only in date.csv, which breaks consumers that align these rows with
    # per-enrollment features by position.
    train_courses = pd.merge(train_enrollments, course_dates,
                             on='course_id', how='left')
    train_courses = train_courses.sort_values(by='enrollment_id')
    print(train_courses.tail(10))
    train_courses.to_csv("data/course_train_processed.csv", index=False)

    # Same join for the test enrollments, so both outputs follow one rule.
    test_enrollments = pd.read_csv(
        'data/test/enrollment_test.csv', usecols=[0, 2],
        converters={2: map_course_value})
    test_courses = pd.merge(test_enrollments, course_dates,
                            on='course_id', how='left')
    test_courses = test_courses.sort_values(by='enrollment_id')
    print(test_courses.tail(10))
    test_courses.to_csv("data/test/course_test_processed.csv", index=False)
def process_log_test_data():
    """Encode the raw test log numerically and save it as CSV.

    Mirrors the training-log step: columns 1 and 4 are converted while
    parsing, ``source`` and ``event`` are replaced by their integer codes,
    and the result goes to ``data/test/log_test_processed.csv``.
    """
    print("テストログデータの読み込み")
    column_converters = {1: map_time_value, 4: map_object_value}
    log = pd.read_csv('data/test/log_test.csv', converters=column_converters)
    print(log.tail(10))
    # __getitem__ keeps the strict lookup: an unknown label raises KeyError.
    log["source"] = log["source"].map(source_mapping.__getitem__)
    log["event"] = log["event"].map(event_mapping.__getitem__)
    print(log.tail(10))
    log.to_csv('data/test/log_test_processed.csv', index=False)
# Run the preprocessing steps: training log, test log, then course data.
process_log_train_data()
process_log_test_data()
process_course_data()
- さまざまな機械学習手法を用いたモデル構築と予測
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pickle
debug_mode = True
# A small sample while debugging, the larger one otherwise.
sample_size = 5000 if debug_mode else 20000
class DropoutPredictor(object):
    """Train GBDT and SVM dropout classifiers and blend their predictions.

    Features are per-enrollment aggregates of the raw click log.  All
    inputs and outputs use fixed paths below ``data/``.
    """

    # Course-id lookup; replaced in __init__ by the unpickled mapping.
    course_map = {}

    def __init__(self):
        """Load the pickled course mapping from ``data/course_mapping.pkl``."""
        print("KDD Cup 2015 コンテストへようこそ")
        # NOTE(review): the preprocessing code earlier in this file pickles
        # its course mapping to 'data/course_id_mapping.csv'; confirm which
        # file is meant to be read here.
        with open("data/course_mapping.pkl", 'rb') as fr:
            self.course_map = pickle.load(fr)

    @staticmethod
    def convert_date_to_number(s):
        """Return ``pd.to_datetime(s)``."""
        return pd.to_datetime(s)

    def normalize_result(self, x):
        """Snap values below 0.0001 to 0 and values above 0.98 to 1.0."""
        if x < 0.0001:
            x = 0
        elif x > 0.98:
            x = 1.0
        return x

    def normalize_course_id(self, c):
        """Return the entry of ``self.course_map`` for course id ``c``."""
        return self.course_map[c]

    def _build_features(self, log_path, enrollment_path):
        """Aggregate one click log into a per-enrollment feature table.

        Args:
            log_path: CSV log; columns 0, 2, 3, 4 are read and accessed as
                enrollment_id, source, event and object.
            enrollment_path: CSV whose columns 0 and 2 are accessed as
                enrollment_id and course_id.

        Returns:
            pandas.DataFrame indexed by enrollment_id, NaN replaced by 0.
        """
        log = pd.read_csv(log_path, usecols=[0, 2, 3, 4])
        # NOTE(review): factorize numbers labels in order of first appearance
        # within *this* file, so the train and test encodings (and therefore
        # the column meaning) can differ -- confirm, or use a fixed mapping.
        log["source"] = pd.factorize(log["source"])[0]
        log["event"] = pd.factorize(log["event"])[0]
        log["object"] = pd.factorize(log["object"])[0]

        by_enrollment = log.groupby("enrollment_id")
        source_counts = (log.groupby(["enrollment_id", "source"])
                         .event.count().unstack())

        enrollments = pd.read_csv(enrollment_path, usecols=[0, 2])
        # Index the course codes by enrollment_id so the assignment below
        # aligns on the same key as the feature table.  Assigning the
        # RangeIndex-ed column directly matched row *positions* against
        # enrollment ids and attached the wrong course (or NaN).
        # Assumes each enrollment_id appears once in the enrollment file.
        course_codes = pd.Series(
            pd.factorize(enrollments["course_id"])[0],
            index=enrollments["enrollment_id"].values)

        # One column per event code: number of log rows of that event.
        features = log.pivot_table("source", index="enrollment_id",
                                   columns="event", aggfunc="count",
                                   fill_value=0)
        features["browser_count"] = source_counts[0]
        features["server_count"] = source_counts[1]
        features["course_id"] = course_codes
        features["total_actions"] = by_enrollment.event.count()
        features["object_std"] = by_enrollment["object"].std()
        features["event_var"] = by_enrollment.event.var()
        features["event_mean"] = by_enrollment.event.mean()
        return features.fillna(0)

    def load_training_data(self):
        """Return ``(X, y)``: training feature matrix and dropout labels.

        Labels are column 1 of ``data/truth_train.csv`` and are paired with
        the feature rows by position.
        """
        feature_data = self._build_features('./data/log_train.csv',
                                            'data/enrollment_train.csv')
        labels = pd.read_csv('data/truth_train.csv', usecols=[1],
                             names=["dropout"])
        print(feature_data.head())
        X = feature_data.values
        y = np.ravel(labels["dropout"])
        return X, y

    def load_test_data(self):
        """Return the test feature matrix built from the test log."""
        feature_data = self._build_features('data/test/log_test.csv',
                                            "data/test/enrollment_test.csv")
        print("テストデータの先頭5件:\n%s" % (feature_data.head(),))
        return feature_data.values

    def train_gbdt_model(self, x_train, x_test, y_train, y_test, test_data):
        """Fit a GBDT, print its hold-out AUC and save test predictions.

        Predictions for ``test_data`` are written to
        ``data/test/gbdt_predictions.csv``.
        """
        model = GradientBoostingClassifier(n_estimators=450,
                                           learning_rate=0.1,
                                           random_state=20)
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        print("GBDTモデルのスコア:  %s" % (score,))
        predictions = model.predict_proba(test_data)[:, 1]
        print("予測結果の先頭5件: %s" % (predictions[:5],))
        self.save_predictions(predictions, "data/test/gbdt_predictions.csv")

    def train_svm_model(self, x_train, x_test, y_train, y_test, test_data):
        """Grid-search an SVM, print its hold-out AUC and save predictions.

        Returns:
            The first five predicted probabilities for ``test_data``.
        """
        param_grid = [{'kernel': ['poly'], 'C': [10, 500, 1200]},
                      {'kernel': ['linear'], 'C': [200, 500, 800]}]
        model = GridSearchCV(SVC(probability=True), param_grid, cv=5,
                             scoring="roc_auc")
        model.fit(x_train, y_train)
        print("最適なパラメータ: ")
        print(model.best_params_)
        y_pred = model.predict_proba(x_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        print("SVMモデルのスコア... %s" % (score,))
        predictions = model.predict_proba(test_data)[:, 1]
        # Keep the score-stamped file, and also write the fixed name that
        # ensemble_results() reads; previously nothing produced that file.
        self.save_predictions(
            predictions, "data/test/svm_predictions" + str(score) + ".csv")
        self.save_predictions(predictions, "data/test/svm_predictions.csv")
        return predictions[:5]

    def save_predictions(self, predictions, fileName):
        """Write ``enrollment_id,dropout`` rows (no header) to ``fileName``.

        The ids are column 0 of ``./data/test/enrollment_test.csv``;
        ``predictions`` is paired with them by position.
        """
        enrollment_test = pd.read_csv('./data/test/enrollment_test.csv',
                                      usecols=[0])
        enrollment_test['dropout'] = predictions
        result = enrollment_test[['enrollment_id', 'dropout']]
        print("***" * 30)
        print(result.head())
        result.to_csv(fileName, index=False, header=False)

    def predict_dropout(self):
        """Load the data, then train and save the GBDT and SVM models."""
        print("訓練データの読み込み...")
        X, y = self.load_training_data()
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.23, random_state=20)
        print("テストデータの読み込み...")
        # Was self.loadTestData(), a method that does not exist.
        test_data = self.load_test_data()
        print("GBDTモデルの構築...")
        self.train_gbdt_model(x_train, x_test, y_train, y_test, test_data)
        # The SVM gets its own split.
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=28)
        print("SVMモデルの構築...")
        self.train_svm_model(x_train, x_test, y_train, y_test, test_data)

    def ensemble_results(self):
        """Blend the saved predictions (0.7 GBDT + 0.3 SVM) and save them.

        The blended, normalized values are written to
        ``data/test/final_predictions.csv`` without header or index.
        """
        print("アンサンブルによる予測...")
        df_gbdt = pd.read_csv("data/test/gbdt_predictions.csv", header=None,
                              names=["id", "dropout1"])
        df_svm = pd.read_csv("data/test/svm_predictions.csv", header=None,
                             usecols=[1], names=["id", "dropout2"])
        df = pd.concat([df_gbdt, df_svm], axis=1)
        df["dropout"] = df.dropout1 * 0.7 + df.dropout2 * 0.3
        # Series.map gives a column on Python 2 and 3 alike; the builtin
        # map() returns a lazy iterator on Python 3.
        df["dropout"] = df["dropout"].map(self.normalize_result)
        print(df.head())
        df.drop(["dropout1", "dropout2"], axis=1, inplace=True)
        print(df.head())
        df.to_csv("data/test/final_predictions.csv", header=False,
                  index=False)
if __name__ == '__main__':
    predictor = DropoutPredictor()
    predictor.predict_dropout()
    predictor.ensemble_results()
    # Accuracy, recall and AUC all come out at roughly 84%.
    print("処理完了")
- 特徴量エンジニアリングの深化とAUC値の向上(約89%)
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import pandas as pd
import pickle as cPickle
from sklearn import svm
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
def normalize_predictions(x):
    """Snap near-certain probabilities to exactly 0 or 1.

    Values above 0.96 become 1, values below 0.000001 become 0, and
    everything in between is returned unchanged.
    """
    if x > 0.96:
        return 1
    if x < 0.000001:
        return 0
    return x
def calculate_duration(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest
def load_training_features():
    """Build the training feature matrix and labels from the processed files.

    Reads ``data/log_train_processed.csv``, ``data/truth_train.csv`` and
    ``data/course_train_processed.csv``.  The unscaled feature table is
    written to ``data/train_features.csv`` and the labels are pickled to
    ``data/train_labels.pkl`` (the two files the fast-path loader reads).

    Returns:
        tuple: ``(X, y)`` -- the scaled feature matrix and the label array.
    """
    def _describe(grouped_column):
        # pandas < 0.20 returns a Series indexed by (group, statistic) that
        # has to be unstacked; later releases already return one column per
        # statistic, where unstack() would produce the wrong shape.
        stats = grouped_column.describe()
        if isinstance(stats, pd.Series):
            stats = stats.unstack()
        return stats

    df1 = pd.read_csv('data/log_train_processed.csv')
    print(df1.head())
    print(df1.tail())
    df2 = pd.read_csv('data/truth_train.csv', header=None, usecols=[1],
                      names=["dropout"])
    df3 = pd.read_csv('data/course_train_processed.csv', usecols=[1, 2, 3])

    grouped_data = df1.groupby("enrollment_id")
    # Base features: number of log rows per event type.
    base_features = df1.pivot_table("source", index='enrollment_id',
                                    columns="event", aggfunc='count',
                                    fill_value=0)
    event_stats = _describe(grouped_data["event"])
    time_stats = _describe(grouped_data["time"]).drop('count', axis=1)
    source_stats = _describe(grouped_data["source"]).drop(
        ['count', 'min', 'max'], axis=1)
    object_stats = _describe(grouped_data["object"]).drop(['count'], axis=1)

    features = pd.concat(
        [base_features, event_stats, time_stats, source_stats, object_stats],
        axis=1)

    # Course columns are attached by position, so the course file must list
    # one row per enrollment in the same order as the grouped log.
    features['duration'] = grouped_data["time"].apply(calculate_duration)
    features["course_id"] = df3["course_id"].values
    features["start_date"] = df3["from"].values
    features["end_date"] = df3["to"].values

    print("元の特徴量データ: ")
    print(features.tail())
    features = features.fillna(0)
    features.to_csv('data/train_features.csv', index=False)

    X = scale(features.values)
    y = np.ravel(df2['dropout'])
    # Cache the labels next to the cached features: the fast-path loader
    # reads this file and nothing else in this script produced it.
    with open("data/train_labels.pkl", 'wb') as fw:
        cPickle.dump(y, fw)
    print("ラベルの例:  %s" % (y[:5],))
    return X, y
def load_test_features():
    """Build the scaled test feature matrix from the processed test files.

    Reads ``data/test/log_test_processed.csv`` and
    ``data/test/course_test_processed.csv``; the unscaled feature table is
    written to ``data/test/test_features.csv``.

    Returns:
        numpy.ndarray: the scaled feature matrix.
    """
    def _describe(grouped_column):
        # pandas < 0.20 returns a Series indexed by (group, statistic) that
        # has to be unstacked; later releases already return one column per
        # statistic, where unstack() would produce the wrong shape.
        stats = grouped_column.describe()
        if isinstance(stats, pd.Series):
            stats = stats.unstack()
        return stats

    df1 = pd.read_csv('data/test/log_test_processed.csv')
    print(df1.head())
    df3 = pd.read_csv('data/test/course_test_processed.csv',
                      usecols=[1, 2, 3])

    grouped_data = df1.groupby("enrollment_id")
    # Base features: number of log rows per event type.
    base_features = df1.pivot_table("source", index='enrollment_id',
                                    columns="event", aggfunc='count',
                                    fill_value=0)
    event_stats = _describe(grouped_data["event"])
    time_stats = _describe(grouped_data["time"]).drop('count', axis=1)
    source_stats = _describe(grouped_data["source"]).drop(
        ['count', 'min', 'max'], axis=1)
    object_stats = _describe(grouped_data["object"]).drop(['count'], axis=1)

    features = pd.concat(
        [base_features, event_stats, time_stats, source_stats, object_stats],
        axis=1)

    # Course columns are attached by position, so the course file must list
    # one row per enrollment in the same order as the grouped log.
    features['duration'] = grouped_data["time"].apply(calculate_duration)
    features["course_id"] = df3["course_id"].values
    features["start_date"] = df3["from"].values
    features["end_date"] = df3["to"].values

    print("テスト特徴量データ: ")
    print(features.tail(10))
    features = features.fillna(0)
    features.to_csv('data/test/test_features.csv', index=False)

    test_features = scale(features.values)
    return test_features
def train_svm_model(x_train, x_test, y_train, y_test, test_features):
    """Fit a linear SVM, print its hold-out AUC and save test predictions.

    Args:
        x_train, y_train: data the model is fitted on.
        x_test, y_test: hold-out split used only for the printed AUC.
        test_features: feature matrix whose predicted dropout probabilities
            are written to ``data/test/svm_predictions.csv``.
    """
    model = svm.SVC(kernel='linear', probability=True, random_state=42)
    model.fit(x_train, y_train)

    holdout_scores = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, holdout_scores)
    print("SVMモデルのスコア:... %s" % (score,))

    # Save predictions for the test set.  The hold-out predictions were
    # saved here before, which pairs the wrong rows (and a different row
    # count) with the test enrollment ids.
    test_predictions = model.predict_proba(test_features)[:, 1]
    save_predictions(test_predictions, 'data/test/svm_predictions.csv')
def train_logistic_regression(x_train, x_test, y_train, y_test, test_features):
    """Fit logistic regression, report hold-out AUC and save predictions.

    The AUC is printed twice: for the raw hold-out probabilities and for
    the same probabilities passed through ``normalize_predictions``.
    Probabilities for ``test_features`` go to
    ``data/test/lr_predictions.csv``.
    """
    classifier = linear_model.LogisticRegression()
    classifier.fit(x_train, y_train)

    holdout_scores = classifier.predict_proba(x_test)[:, 1]
    raw_auc = roc_auc_score(y_test, holdout_scores)
    print("ロジスティック回帰モデルのスコア:  %s" % (raw_auc,))

    snapped_scores = [normalize_predictions(p) for p in holdout_scores]
    snapped_auc = roc_auc_score(y_test, snapped_scores)
    print("正規化後のスコア ...  %s" % (snapped_auc,))

    save_predictions(classifier.predict_proba(test_features)[:, 1],
                     'data/test/lr_predictions.csv')
def train_random_forest(x_train, x_test, y_train, y_test, test_features):
    """Fit a random forest, print its hold-out AUC and save test predictions.

    Args:
        x_train, y_train: data the model is fitted on.
        x_test, y_test: hold-out split used only for the printed AUC.
        test_features: feature matrix whose predicted dropout probabilities
            are written to ``./data/test/rf_predictions.csv``.
    """
    model = RandomForestClassifier(n_estimators=100)
    model.fit(x_train, y_train)
    predictions = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, predictions)
    # predict() returns a 1-D array of class labels, so indexing it with
    # [:, 1] fails; the positive-class probability comes from predict_proba.
    test_predictions = model.predict_proba(test_features)[:, 1]
    print("ランダムフォレストモデルのスコア:  %s" % (score,))
    save_predictions(test_predictions, './data/test/rf_predictions.csv')
def train_gradient_boosting(x_train, x_test, y_train, y_test, test_features):
    """Fit a 500-tree GBDT, print its hold-out AUC and save test predictions.

    The output file name embeds the AUC:
    ``data/test/gbdt_predictions<score>.csv``.
    """
    booster = GradientBoostingClassifier(n_estimators=500)
    booster.fit(x_train, y_train)

    holdout_scores = booster.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, holdout_scores)

    test_predictions = booster.predict_proba(test_features)[:, 1]
    print("勾配ブースティングモデルのスコア:  %s" % (score,))
    target = 'data/test/gbdt_predictions' + str(score) + '.csv'
    save_predictions(test_predictions, target)
def save_predictions(predictions, fileName):
    """Write ``enrollment_id,dropout`` rows to ``fileName``.

    The ids are column 0 of ``data/test/enrollment_test.csv``;
    ``predictions`` is paired with them by position.  The output has
    neither a header row nor an index column.
    """
    enrollment_ids = pd.read_csv('data/test/enrollment_test.csv', usecols=[0])
    submission = enrollment_ids.assign(dropout=predictions)
    print(submission.head())
    submission.to_csv(fileName, header=False, index=False)
# Blend the saved predictions of several models.
def ensemble_predictions(base_path="data/test/gbdt_predictions.csv",
                         model1_path="data/test/gbdt_predictions0.875919444048.csv",
                         model2_path="data/test/final_predictions.csv",
                         output_path="data/test/final_predictions.csv",
                         weights=(0.5, 0.2, 0.3)):
    """Write the weighted average of three prediction files.

    Each input is a header-less CSV whose columns are read as id and
    prediction; rows are combined by position.  The defaults reproduce the
    previously hard-coded behaviour, including the file name tied to one
    particular run's score and the in-place update of
    ``final_predictions.csv``.

    Args:
        base_path: file supplying the ids and the first prediction column.
        model1_path: file supplying the second prediction column.
        model2_path: file supplying the third prediction column.
        output_path: destination CSV (no header, no index).
        weights: three factors applied to the base, model1 and model2
            predictions, in that order.
    """
    base_weight, model1_weight, model2_weight = weights
    base_df = pd.read_csv(base_path, header=None, names=["id", "dropout"])
    model1_df = pd.read_csv(model1_path, header=None, usecols=[1],
                            names=["dropout1"])
    model2_df = pd.read_csv(model2_path, header=None, usecols=[1],
                            names=["dropout2"])
    base_df["dropout"] = (base_df["dropout"] * base_weight
                          + model1_df["dropout1"] * model1_weight
                          + model2_df["dropout2"] * model2_weight)
    # NOTE(review): a self-assignment of the "dropout" column stood here and
    # had no effect; if normalize_predictions() was meant to be applied to
    # the blend, add it explicitly.
    base_df.to_csv(output_path, index=False, header=False)
# Load the cached, already-preprocessed data (fast path).
def load_preprocessed_training_data():
    """Return ``(X, y)`` from the cached training files.

    ``X`` holds the unscaled values of ``data/train_features.csv``; ``y``
    is the object unpickled from ``data/train_labels.pkl``.
    """
    df1 = pd.read_csv('data/train_features.csv')
    print(df1.head())
    X = df1.values
    # X = scale(X)  # enable to standardise the features
    with open("data/train_labels.pkl", 'rb') as fr2:
        # This script imports the module only under the name cPickle; the
        # bare name `pickle` used before is not defined by its imports.
        # Unpickling executes code from the file, so only load trusted files.
        y = cPickle.load(fr2)
    return X, y
def load_preprocessed_test_data():
    """Return the unscaled values of ``data/test/test_features.csv``."""
    cached = pd.read_csv('data/test/test_features.csv')
    # Scaling is intentionally left off here; apply scale() to the returned
    # array if standardised features are needed.
    return cached.values
def run_prediction_pipeline():
    """Blend existing predictions, then train the individual models.

    Logistic regression gets its own split (31% hold-out); gradient
    boosting and the SVM share a second split (28% hold-out).
    """
    # NOTE(review): this runs before any model below is trained, so it can
    # only blend prediction files left by earlier runs -- confirm intended.
    ensemble_predictions()

    print("訓練データの読み込み...")
    features, labels = load_preprocessed_training_data()
    print("テストデータの読み込み...")
    test_features = load_preprocessed_test_data()

    print("\nロジスティック回帰モデルの実行...")
    lr_fit_x, lr_eval_x, lr_fit_y, lr_eval_y = train_test_split(
        features, labels, test_size=0.31, random_state=148)
    train_logistic_regression(lr_fit_x, lr_eval_x, lr_fit_y, lr_eval_y,
                              test_features)

    # Split shared by the remaining models.
    fit_x, eval_x, fit_y, eval_y = train_test_split(
        features, labels, test_size=0.28, random_state=151)
    print("\nランダムフォレストモデルの実行...")
    # train_random_forest(fit_x, eval_x, fit_y, eval_y, test_features)  # enable when needed

    print("\n勾配ブースティングモデルの実行...")
    train_gradient_boosting(fit_x, eval_x, fit_y, eval_y, test_features)

    print("\nSVMモデルの実行...")
    train_svm_model(fit_x, eval_x, fit_y, eval_y, test_features)
if __name__ == "__main__":
    # Announce start and end around the full pipeline run.
    print("処理開始>>>")
    run_prediction_pipeline()
    print("処理完了")