KDD Cup 2015 コース離脱予測モデルの開発

KDD Cup 2015の二値分類問題、コースからの離脱予測に関する実装を整理します。

  1. データ前処理:NumPyとPandasライブラリを用いた特徴量の数値化処理
#!/usr/bin/env python
# coding=utf-8

import pickle
import pandas as pd
import numpy as np

# Integer codes used to replace the categorical "source" and "event" columns.
source_mapping = {
    'server': 0,
    'browser': 1,
}
event_mapping = {
    "page_close": 0,
    "access": 1,
    "navigate": 2,
    "video": 3,
    "wiki": 4,
    "problem": 5,
    "discussion": 6,
}


def create_time_dictionary():
    """Build the day -> offset lookup and pickle it to disk.

    Returns:
        A pandas Series indexed by every calendar day from 2013-10-27
        through 2014-08-01 whose values count up from 0.
    """
    days = pd.date_range('2013-10-27', '2014-08-01')
    offsets = np.arange(len(days))
    time_dict = pd.Series(offsets, index=days)

    # Stored with pickle; the ".csv" suffix does not describe the format.
    with open('data/time_dict.csv', 'wb') as handle:
        pickle.dump(time_dict, handle)

    return time_dict


def generate_course_dictionary():
    """Assign an integer code to every course id and pickle the mapping.

    Reads the first column of ``data/date.csv`` (accessed as ``course_id``)
    and numbers the distinct ids in order of first appearance.

    Returns:
        dict mapping course id -> integer code (0, 1, 2, ...).
    """
    df = pd.read_csv('data/date.csv', usecols=[0])
    unique_courses = pd.factorize(df.course_id)[1]
    course_dict = {course: code for code, course in enumerate(unique_courses)}

    # Stored with pickle; the ".csv" suffix does not describe the format.
    with open('data/course_id_mapping.csv', 'wb') as fw:
        pickle.dump(course_dict, fw)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("course_mapping作成完了")
    return course_dict


def build_object_dictionary():
    """Assign one integer id to every distinct log ``object`` and pickle it.

    Objects from ``data/log_train.csv`` are numbered first, in order of
    first appearance; objects that only occur in ``data/test/log_test.csv``
    continue the numbering after them. Column 4 of both files is read and
    accessed as ``object``.

    Returns:
        dict mapping object value -> integer id (plain Python ints).
    """
    train_objects = pd.read_csv('data/log_train.csv', usecols=[4]).object
    test_objects = pd.read_csv('data/test/log_test.csv', usecols=[4]).object

    obj_dict = {}
    for uniques in (pd.factorize(train_objects)[1],
                    pd.factorize(test_objects)[1]):
        for name in uniques:
            # setdefault leaves ids of already-known objects untouched, so
            # only unseen objects receive the next free id.
            obj_dict.setdefault(name, len(obj_dict))

    # Stored with pickle; the ".csv" suffix does not describe the format.
    with open('data/object_mapping.csv', 'wb') as fw:
        pickle.dump(obj_dict, fw)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("object_mapping作成完了")
    return obj_dict


def map_time_value(x):
    """Look up the day offset for a timestamp string.

    Only the first 10 characters of ``x`` are used as the key into the
    module-level ``time_dictionary``.
    """
    day_key = x[:10]
    return time_dictionary[day_key]


def map_object_value(x):
    """Return the integer id stored for ``x`` in ``object_dictionary``."""
    object_id = object_dictionary[x]
    return object_id


def map_course_value(x):
    """Return the integer code stored for ``x`` in ``course_dictionary``."""
    course_code = course_dictionary[x]
    return course_code

# Build the lookup tables at import time. The map_*_value converters above
# read these module-level names, so they must exist before any CSV is parsed.
# Each call also reads/writes files under data/ as a side effect.
time_dictionary = create_time_dictionary()
course_dictionary = generate_course_dictionary()
object_dictionary = build_object_dictionary()



def process_log_train_data():
    """Convert the training log to an all-numeric CSV.

    Column 1 (time) and column 4 (object) are converted while parsing via
    ``map_time_value`` / ``map_object_value``; ``source`` and ``event`` are
    then replaced by their codes from ``source_mapping`` / ``event_mapping``.
    The result is written to ``data/log_train_processed.csv``.

    Raises:
        KeyError: if a value has no entry in the corresponding mapping.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("訓練ログデータの読み込み")
    df1 = pd.read_csv('data/log_train.csv',
                      converters={1: map_time_value, 4: map_object_value})
    print(df1.head())

    # Explicit dict indexing (rather than Series.map(dict)) so that an
    # unknown category fails loudly instead of silently becoming NaN.
    df1.source = df1.source.map(lambda x: source_mapping[x])
    df1.event = df1.event.map(lambda x: event_mapping[x])
    print(df1.head())
    print(df1.tail())
    df1.to_csv('data/log_train_processed.csv', index=False)
    

def process_course_data():
    """Attach numeric course dates to every train and test enrollment.

    Enrollment files are read as (column 0, column 2) with the course id
    converted by ``map_course_value``; ``data/date.csv`` is converted with
    ``map_course_value`` / ``map_time_value``. Both outputs are sorted by
    ``enrollment_id`` and written to ``data/course_train_processed.csv`` and
    ``data/test/course_test_processed.csv``.
    """
    course_dates = pd.read_csv(
        'data/date.csv',
        converters={0: map_course_value, 1: map_time_value, 2: map_time_value})

    # Training data.
    # A left join keeps exactly one row per enrollment. The previous outer
    # join could add enrollment-less rows (NaN enrollment_id) for courses
    # that nobody in this split enrolled in, which breaks consumers that
    # rely on one row per enrollment.
    train_enrollments = pd.read_csv('data/enrollment_train.csv', usecols=[0, 2],
                                    converters={2: map_course_value})
    train_courses = pd.merge(train_enrollments, course_dates,
                             on='course_id', how='left')
    train_courses = train_courses.sort_values(by='enrollment_id')
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(train_courses.tail(10))
    train_courses.to_csv("data/course_train_processed.csv", index=False)

    # Test data: same join as above so both splits are built identically.
    test_enrollments = pd.read_csv('data/test/enrollment_test.csv', usecols=[0, 2],
                                   converters={2: map_course_value})
    test_courses = pd.merge(test_enrollments, course_dates,
                            on='course_id', how='left')
    test_courses = test_courses.sort_values(by='enrollment_id')
    print(test_courses.tail(10))
    test_courses.to_csv("data/test/course_test_processed.csv", index=False)



def process_log_test_data():
    """Convert the test log to an all-numeric CSV.

    Mirrors ``process_log_train_data``: time and object are converted while
    parsing, ``source`` and ``event`` are replaced by their mapping codes,
    and the result is written to ``data/test/log_test_processed.csv``.

    Raises:
        KeyError: if a value has no entry in the corresponding mapping.
    """
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("テストログデータの読み込み")
    df1 = pd.read_csv('data/test/log_test.csv',
                      converters={1: map_time_value, 4: map_object_value})
    print(df1.tail(10))
    # Explicit dict indexing so an unknown category raises instead of
    # silently turning into NaN.
    df1.source = df1.source.map(lambda x: source_mapping[x])
    df1.event = df1.event.map(lambda x: event_mapping[x])
    print(df1.tail(10))
    df1.to_csv('data/test/log_test_processed.csv', index=False)

# Run the preprocessing steps. These execute at import time and write the
# *_processed.csv files under data/ and data/test/.
process_log_train_data()
process_log_test_data()
process_course_data()
  2. さまざまな機械学習手法を用いたモデル構築と予測
#!/usr/bin/env python
# coding=utf-8

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
import pickle


# Debug switch: the smaller sample size is selected while it is enabled.
debug_mode = True
sample_size = 5000 if debug_mode else 20000


class DropoutPredictor(object):
    """Train GBDT and SVM dropout classifiers and blend their predictions.

    Features are per-enrollment aggregates of the raw log files (event
    counts, source counts, simple statistics). Predictions are written as
    header-less ``enrollment_id,dropout`` CSV files under ``data/test/``.
    """

    # File read by ensemble_results() when this instance has not trained an
    # SVM model itself.
    DEFAULT_SVM_PREDICTIONS_PATH = "data/test/svm_predictions.csv"

    course_map = {}
    # Path of the file written by the latest train_svm_model() call, so that
    # ensemble_results() can find the score-suffixed file name.
    _svm_predictions_path = None

    def __init__(self):
        """Print a banner and load the pickled course mapping."""
        # Single-argument print calls behave the same on Python 2 and 3.
        print("KDD Cup 2015 コンテストへようこそ")

        # NOTE(review): confirm that this pickle exists; its producer is not
        # part of this class.
        with open("data/course_mapping.pkl", 'rb') as fr:
            self.course_map = pickle.load(fr)

    @staticmethod
    def convert_date_to_number(s):
        """Parse ``s`` with ``pandas.to_datetime`` and return the result."""
        return pd.to_datetime(s)

    def normalize_result(self, x):
        """Snap extreme probabilities: below 0.0001 -> 0, above 0.98 -> 1.0."""
        if x < 0.0001:
            return 0
        if x > 0.98:
            return 1.0
        return x

    def normalize_course_id(self, c):
        """Return the entry of ``self.course_map`` for course id ``c``."""
        return self.course_map[c]

    def _build_features(self, log_path, enrollment_path):
        """Aggregate one log file into a per-enrollment feature frame.

        Args:
            log_path: CSV whose columns 0, 2, 3, 4 are enrollment_id,
                source, event and object.
            enrollment_path: CSV whose columns 0 and 2 are enrollment_id
                and course_id.

        Returns:
            DataFrame indexed by enrollment_id with missing values set to 0.
        """
        logs = pd.read_csv(log_path, usecols=[0, 2, 3, 4])
        # sort=True numbers categories in sorted order rather than order of
        # first appearance, so train and test receive the same codes (and
        # the same pivot column order) whenever they contain the same set
        # of categories.
        for column in ("source", "event", "object"):
            logs[column] = pd.factorize(logs[column], sort=True)[0]

        by_enrollment = logs.groupby("enrollment_id")
        # reindex guarantees both source columns exist even if one source
        # never occurs; with sorted codes 0 is 'browser' and 1 is 'server'.
        source_counts = (logs.groupby(["enrollment_id", "source"])
                         .event.count().unstack().reindex(columns=[0, 1]))

        # One column per event code holding the number of log rows.
        feature_data = logs.pivot_table("source", index="enrollment_id",
                                        columns="event", aggfunc="count",
                                        fill_value=0)
        feature_data["browser_count"] = source_counts[0]
        feature_data["server_count"] = source_counts[1]

        # Key the course codes by enrollment_id so the assignment aligns on
        # the enrollment rather than on the CSV row number.
        enrollments = pd.read_csv(enrollment_path, usecols=[0, 2])
        course_codes = pd.Series(
            pd.factorize(enrollments.course_id, sort=True)[0],
            index=enrollments.enrollment_id.values)
        feature_data["course_id"] = course_codes

        feature_data["total_actions"] = by_enrollment.event.count()
        feature_data["object_std"] = by_enrollment.object.std()
        feature_data["event_var"] = by_enrollment.event.var()
        feature_data["event_mean"] = by_enrollment.event.mean()
        return feature_data.fillna(0)

    def load_training_data(self):
        """Return ``(X, y)`` built from the training log and truth file.

        NOTE(review): labels are taken positionally from column 1 of
        ``data/truth_train.csv``; this assumes its rows follow the same
        enrollment order as the feature rows — confirm.
        """
        feature_data = self._build_features('./data/log_train.csv',
                                            'data/enrollment_train.csv')
        print(feature_data.head())

        labels = pd.read_csv('data/truth_train.csv', header=None,
                             usecols=[1], names=["dropout"])
        X = feature_data.values
        y = np.ravel(labels["dropout"])
        return X, y

    def load_test_data(self):
        """Return the feature matrix built from the test log."""
        feature_data = self._build_features('data/test/log_test.csv',
                                            "data/test/enrollment_test.csv")
        print("テストデータの先頭5件:")
        print(feature_data.head())
        return feature_data.values

    def train_gbdt_model(self, x_train, x_test, y_train, y_test, test_data):
        """Fit a GBDT, print its hold-out AUC and save test predictions."""
        model = GradientBoostingClassifier(n_estimators=450, learning_rate=0.1,
                                           random_state=20)
        model.fit(x_train, y_train)

        # Hold-out evaluation.
        y_pred = model.predict_proba(x_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        print("GBDTモデルのスコア:  %s" % score)

        # Predictions for the competition test set.
        predictions = model.predict_proba(test_data)[:, 1]
        print("予測結果の先頭5件: %s" % (predictions[:5],))
        self.save_predictions(predictions, "data/test/gbdt_predictions.csv")

    def train_svm_model(self, x_train, x_test, y_train, y_test, test_data):
        """Grid-search an SVM, print its hold-out AUC and save predictions.

        Returns:
            The first five test-set probabilities.
        """
        param_grid = [{'kernel': ['poly'], 'C': [10, 500, 1200]},
                      {'kernel': ['linear'], 'C': [200, 500, 800]}]
        model = GridSearchCV(SVC(probability=True), param_grid, cv=5,
                             scoring="roc_auc")

        model.fit(x_train, y_train)
        print("最適なパラメータ: ")
        print(model.best_params_)

        # Hold-out evaluation.
        y_pred = model.predict_proba(x_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        print("SVMモデルのスコア... %s" % score)

        # Predictions for the competition test set. The file name carries
        # the score, so remember it for ensemble_results().
        predictions = model.predict_proba(test_data)[:, 1]
        output_path = "data/test/svm_predictions" + str(score) + ".csv"
        self.save_predictions(predictions, output_path)
        self._svm_predictions_path = output_path
        return predictions[:5]

    def save_predictions(self, predictions, fileName):
        """Write ``enrollment_id,dropout`` rows (no header) to ``fileName``.

        ``predictions`` is paired positionally with the ids in column 0 of
        ``./data/test/enrollment_test.csv``.
        """
        enrollment_test = pd.read_csv('./data/test/enrollment_test.csv',
                                      usecols=[0])
        enrollment_test['dropout'] = predictions
        result = enrollment_test[['enrollment_id', 'dropout']]
        print("***" * 30)
        print(result.head())
        result.to_csv(fileName, index=False, header=False)

    def predict_dropout(self):
        """Load the data, then train and save the GBDT and SVM models."""
        print("訓練データの読み込み...")
        X, y = self.load_training_data()

        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.23, random_state=20)
        print("テストデータの読み込み...")
        # Fixed: this previously called the non-existent loadTestData().
        test_data = self.load_test_data()

        print("GBDTモデルの構築...")
        self.train_gbdt_model(x_train, x_test, y_train, y_test, test_data)

        # The SVM uses its own split.
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=28)
        print("SVMモデルの構築...")
        self.train_svm_model(x_train, x_test, y_train, y_test, test_data)

    def ensemble_results(self):
        """Blend GBDT (0.7) and SVM (0.3) predictions into the final file."""
        print("アンサンブルによる予測...")
        # Prefer the file this instance just produced; otherwise fall back
        # to the fixed default name.
        svm_path = self._svm_predictions_path or self.DEFAULT_SVM_PREDICTIONS_PATH
        df_gbdt = pd.read_csv("data/test/gbdt_predictions.csv", header=None,
                              names=["id", "dropout1"])
        df_svm = pd.read_csv(svm_path, header=None, usecols=[1],
                             names=["id", "dropout2"])

        df = pd.concat([df_gbdt, df_svm], axis=1)
        df["dropout"] = df.dropout1 * 0.7 + df.dropout2 * 0.3
        # A list (not a lazy map object) so the assignment also works on
        # Python 3.
        df["dropout"] = [self.normalize_result(p) for p in df["dropout"]]
        print(df.head())

        # Keep only id and the blended probability.
        df.drop(["dropout1", "dropout2"], axis=1, inplace=True)
        print(df.head())
        df.to_csv("data/test/final_predictions.csv", header=False, index=False)
       


if __name__ == '__main__':
    predictor = DropoutPredictor()
    predictor.predict_dropout()
    predictor.ensemble_results()
    # Author's note: accuracy, recall and AUC come out at roughly 84%.
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print("処理完了")
  3. 特徴量エンジニアリングの深化とAUC値の向上(約89%)
#!/usr/bin/env python
# coding=utf-8

import numpy as np
import pandas as pd
import pickle as cPickle

from sklearn import svm
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score



def normalize_predictions(x):
    """Snap extreme probabilities: below 0.000001 -> 0, above 0.96 -> 1.

    Any other value is returned unchanged.
    """
    if x < 0.000001:
        return 0
    if x > 0.96:
        return 1
    return x

def calculate_duration(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    latest = x.max()
    earliest = x.min()
    return latest - earliest

def load_training_features():
    """Build the scaled training feature matrix and label vector.

    Reads ``data/log_train_processed.csv``, column 1 of
    ``data/truth_train.csv`` and columns 1-3 of
    ``data/course_train_processed.csv``. The unscaled feature table is also
    written to ``data/train_features.csv``.

    Returns:
        (X, y): X is the feature matrix after ``scale``; y is the flattened
        dropout column.
    """
    def describe_wide(grouped_column):
        # pandas < 0.20 returns a long Series indexed by (group, statistic)
        # that needs unstack(); newer versions already return one row per
        # group, where unstack() would break the table.
        stats = grouped_column.describe()
        if isinstance(stats, pd.Series):
            stats = stats.unstack()
        return stats

    # Single-argument print calls behave the same on Python 2 and Python 3.
    df1 = pd.read_csv('data/log_train_processed.csv')
    print(df1.head())
    print(df1.tail())

    # Labels.
    df2 = pd.read_csv('data/truth_train.csv', header=None, usecols=[1],
                      names=["dropout"])

    # Course information (read below as course_id, from, to).
    df3 = pd.read_csv('data/course_train_processed.csv', usecols=[1, 2, 3])

    grouped_data = df1.groupby("enrollment_id")

    # Base features: number of log rows per event type.
    base_features = df1.pivot_table("source", index='enrollment_id',
                                    columns="event", aggfunc='count',
                                    fill_value=0)

    # Summary statistics per enrollment; redundant columns are dropped.
    event_stats = describe_wide(grouped_data.event)
    time_stats = describe_wide(grouped_data.time).drop('count', axis=1)
    source_stats = describe_wide(grouped_data.source).drop(
        ['count', 'min', 'max'], axis=1)
    object_stats = describe_wide(grouped_data.object).drop(['count'], axis=1)

    features = pd.concat(
        [base_features, event_stats, time_stats, source_stats, object_stats],
        axis=1)

    # Course-level features: active span, id, start and end day.
    # The course columns are assigned positionally, so df3 must contain one
    # row per enrollment in the same order as the feature index.
    features['duration'] = grouped_data.time.apply(calculate_duration)
    features["course_id"] = df3["course_id"].values
    features["start_date"] = df3["from"].values
    features["end_date"] = df3["to"].values

    print("元の特徴量データ: ")
    print(features.tail())
    features = features.fillna(0)
    features.to_csv('data/train_features.csv', index=False)

    # Standardise the features.
    X = features.values
    X = scale(X)

    y = np.ravel(df2['dropout'])
    print("ラベルの例:  %s" % (y[:5],))
    return X, y

def load_test_features():
    """Build the scaled test feature matrix.

    Reads ``data/test/log_test_processed.csv`` and columns 1-3 of
    ``data/test/course_test_processed.csv``. The unscaled feature table is
    also written to ``data/test/test_features.csv``.

    Returns:
        The feature matrix after ``scale``.
    """
    def describe_wide(grouped_column):
        # pandas < 0.20 returns a long Series indexed by (group, statistic)
        # that needs unstack(); newer versions already return one row per
        # group, where unstack() would break the table.
        stats = grouped_column.describe()
        if isinstance(stats, pd.Series):
            stats = stats.unstack()
        return stats

    # Single-argument print calls behave the same on Python 2 and Python 3.
    df1 = pd.read_csv('data/test/log_test_processed.csv')
    print(df1.head())

    # Course information (read below as course_id, from, to).
    df3 = pd.read_csv('data/test/course_test_processed.csv', usecols=[1, 2, 3])

    grouped_data = df1.groupby("enrollment_id")

    # Base features: number of log rows per event type.
    base_features = df1.pivot_table("source", index='enrollment_id',
                                    columns="event", aggfunc='count',
                                    fill_value=0)

    # Summary statistics per enrollment; redundant columns are dropped.
    event_stats = describe_wide(grouped_data.event)
    time_stats = describe_wide(grouped_data.time).drop('count', axis=1)
    source_stats = describe_wide(grouped_data.source).drop(
        ['count', 'min', 'max'], axis=1)
    object_stats = describe_wide(grouped_data.object).drop(['count'], axis=1)

    features = pd.concat(
        [base_features, event_stats, time_stats, source_stats, object_stats],
        axis=1)

    # Course-level features: active span, id, start and end day.
    # The course columns are assigned positionally, so df3 must contain one
    # row per enrollment in the same order as the feature index.
    features['duration'] = grouped_data.time.apply(calculate_duration)
    features["course_id"] = df3["course_id"].values
    features["start_date"] = df3["from"].values
    features["end_date"] = df3["to"].values

    print("テスト特徴量データ: ")
    print(features.tail(10))
    features = features.fillna(0)
    features.to_csv('data/test/test_features.csv', index=False)

    # NOTE(review): the test matrix is standardised with its own mean and
    # variance, independently of the training matrix — confirm this is
    # intended.
    test_features = features.values
    test_features = scale(test_features)
    return test_features


def train_svm_model(x_train, x_test, y_train, y_test, test_features):
    """Fit a linear SVM, print its hold-out AUC and save test predictions.

    Args:
        x_train, y_train: data the model is fitted on.
        x_test, y_test: hold-out data used only for the printed AUC.
        test_features: feature matrix whose dropout probabilities are
            written to ``data/test/svm_predictions.csv``.
    """
    model = svm.SVC(kernel='linear', probability=True, random_state=42)
    model.fit(x_train, y_train)

    # Hold-out evaluation.
    validation_predictions = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, validation_predictions)
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print("SVMモデルのスコア:... %s" % score)

    # Fixed: the hold-out predictions used to be saved here; the output file
    # has to contain predictions for test_features instead.
    test_predictions = model.predict_proba(test_features)[:, 1]
    save_predictions(test_predictions, 'data/test/svm_predictions.csv')


def train_logistic_regression(x_train, x_test, y_train, y_test, test_features):
    """Fit logistic regression, print hold-out AUCs and save predictions.

    Two AUC values are printed: one for the raw hold-out probabilities and
    one after passing them through ``normalize_predictions``. Probabilities
    for ``test_features`` are written to ``data/test/lr_predictions.csv``.
    """
    model = linear_model.LogisticRegression()
    model.fit(x_train, y_train)

    # Hold-out evaluation.
    predictions = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, predictions)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("ロジスティック回帰モデルのスコア:  %s" % score)

    # A list (not a lazy map object) so roc_auc_score receives a real
    # sequence on Python 3 as well.
    normalized_predictions = [normalize_predictions(p) for p in predictions]
    normalized_score = roc_auc_score(y_test, normalized_predictions)
    print("正規化後のスコア ...  %s" % normalized_score)

    # Predictions for the competition test set.
    test_predictions = model.predict_proba(test_features)[:, 1]
    save_predictions(test_predictions, 'data/test/lr_predictions.csv')

def train_random_forest(x_train, x_test, y_train, y_test, test_features):
    """Fit a random forest, print its hold-out AUC and save predictions.

    Probabilities for ``test_features`` are written to
    ``./data/test/rf_predictions.csv``.
    """
    model = RandomForestClassifier(n_estimators=100)
    model.fit(x_train, y_train)

    # Hold-out evaluation.
    predictions = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, predictions)

    # Fixed: predict() returns 1-D class labels, so indexing it with [:, 1]
    # fails; the positive-class probability comes from predict_proba().
    test_predictions = model.predict_proba(test_features)[:, 1]
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print("ランダムフォレストモデルのスコア:  %s" % score)
    save_predictions(test_predictions, './data/test/rf_predictions.csv')


def train_gradient_boosting(x_train, x_test, y_train, y_test, test_features):
    """Fit a GBDT, print its hold-out AUC and save test predictions.

    The output file name embeds the hold-out score:
    ``data/test/gbdt_predictions<score>.csv``.
    """
    model = GradientBoostingClassifier(n_estimators=500)
    model.fit(x_train, y_train)

    # Hold-out evaluation.
    predictions = model.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, predictions)

    # Predictions for the competition test set.
    test_predictions = model.predict_proba(test_features)[:, 1]
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print("勾配ブースティングモデルのスコア:  %s" % score)
    save_predictions(test_predictions,
                     'data/test/gbdt_predictions' + str(score) + '.csv')


def save_predictions(predictions, fileName):
    """Write ``enrollment_id,dropout`` rows (no header, no index) to a CSV.

    Args:
        predictions: one value per row of ``data/test/enrollment_test.csv``;
            values are paired with the ids positionally.
        fileName: destination path.
    """
    df = pd.read_csv('data/test/enrollment_test.csv', usecols=[0])
    df['dropout'] = predictions
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print(df.head())
    df.to_csv(fileName, index=False, header=False)

# Blend the prediction files of several models.
def ensemble_predictions(base_path="data/test/gbdt_predictions.csv",
                         model1_path="data/test/gbdt_predictions0.875919444048.csv",
                         model2_path="data/test/final_predictions.csv",
                         output_path="data/test/final_predictions.csv",
                         weights=(0.5, 0.2, 0.3)):
    """Write the weighted average of three prediction files.

    Every input is a header-less CSV of ``id,probability`` rows; rows are
    combined positionally. With the defaults the result overwrites
    ``data/test/final_predictions.csv``, which is also the third input.

    Args:
        base_path: file supplying the ids and the first probability column.
        model1_path: file supplying the second probability column.
        model2_path: file supplying the third probability column.
        output_path: destination CSV (no header, no index).
        weights: (base, model1, model2) blending weights.
    """
    base_weight, model1_weight, model2_weight = weights

    base_df = pd.read_csv(base_path, header=None, names=["id", "dropout"])
    model1_df = pd.read_csv(model1_path, header=None, usecols=[1],
                            names=["dropout1"])
    model2_df = pd.read_csv(model2_path, header=None, usecols=[1],
                            names=["dropout2"])

    # Weighted average of the three probability columns.
    base_df["dropout"] = (base_df["dropout"] * base_weight
                          + model1_df["dropout1"] * model1_weight
                          + model2_df["dropout2"] * model2_weight)
    base_df.to_csv(output_path, index=False, header=False)

# Load already-preprocessed data (fast path that skips feature extraction).
def load_preprocessed_training_data():
    """Return ``(X, y)`` from the cached training feature and label files.

    X is the content of ``data/train_features.csv`` (unscaled); y is the
    object unpickled from ``data/train_labels.pkl``.
    NOTE(review): confirm which step writes ``data/train_labels.pkl``.
    """
    # Imported locally: this script binds the module as ``cPickle``, so the
    # bare name ``pickle`` used below was otherwise undefined here.
    import pickle

    df1 = pd.read_csv('data/train_features.csv')
    # Single-argument print call behaves the same on Python 2 and Python 3.
    print(df1.head())
    X = df1.values
    # X = scale(X)  # uncomment if standardised features are needed

    with open("data/train_labels.pkl", 'rb') as fr2:
        y = pickle.load(fr2)
    return X, y


def load_preprocessed_test_data():
    """Return the cached test feature matrix.

    The values of ``data/test/test_features.csv`` are returned as-is
    (unscaled).
    """
    cached_features = pd.read_csv('data/test/test_features.csv')
    # Apply scale() to the returned matrix here if standardised features
    # are needed.
    return cached_features.values


def run_prediction_pipeline():
    """Blend existing prediction files, then train and save each model.

    NOTE(review): the blend runs before any model is trained, so it can
    only combine prediction files left by earlier runs — confirm that this
    order is intended.
    """
    ensemble_predictions()

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("訓練データの読み込み...")
    X, y = load_preprocessed_training_data()

    print("テストデータの読み込み...")
    test_features = load_preprocessed_test_data()

    # Logistic regression.
    print("\nロジスティック回帰モデルの実行...")
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.31, random_state=148)
    train_logistic_regression(x_train, x_test, y_train, y_test, test_features)

    # Random forest. The model itself is disabled, but the split made here
    # is the one used by the gradient boosting and SVM models below.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.28, random_state=151)
    print("\nランダムフォレストモデルの実行...")
    # train_random_forest(x_train,x_test,y_train,y_test,test_features)  # uncomment to enable

    # Gradient boosting.
    print("\n勾配ブースティングモデルの実行...")
    train_gradient_boosting(x_train, x_test, y_train, y_test, test_features)

    # SVM.
    print("\nSVMモデルの実行...")
    train_svm_model(x_train, x_test, y_train, y_test, test_features)


if __name__ == "__main__":
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("処理開始>>>")
    run_prediction_pipeline()
    print("処理完了")

タグ: 機械学習 特徴量工学 データ前処理 勾配ブースティング SVM

6月28日 16:31 投稿