日本語の分類
LightGBMによるテンプレート
形態素解析を行い、説明変数・目的変数の行列を作成する
import MeCab
from tqdm.auto import tqdm
# Build a bag-of-words count matrix X (one row per text, one column per
# distinct token) and the target vector y.
# NOTE(review): `data` and `np` are not defined in this section; they are
# assumed to be provided earlier in the file/notebook.
parser = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
# Sanity check on the dictionary: "COVID19" must come back as a single token,
# otherwise the NEologd dictionary is not being picked up (or is outdated).
assert parser.parse("COVID19").strip().split() == ["COVID19"], "辞書ファイルが古いです"

# NOTE(review): the original built the vocabulary from `data.tweet` but filled
# the matrix from `data.tweet_text`. Both steps must read the same column or
# the index lookup can fail; `tweet_text` is used here — confirm the column name.
texts = data.tweet_text

# Tokenise every text exactly once and reuse the result for both the
# vocabulary and the counts.
tokenized = [parser.parse(text).split() for text in texts]

ws = set()
for tokens in tokenized:
    ws.update(tokens)

# Map each token to a column index. Sorting makes the column order (and hence
# the trained model) reproducible; raw set iteration order can differ between
# interpreter runs.
wi = {w: i for i, w in enumerate(sorted(ws))}
wnum = len(wi)
rnum = len(data)

# Dense count matrix: rnum x wnum floats. This can get large for big vocabularies.
X = np.zeros((rnum, wnum))
for i, tokens in tqdm(enumerate(tokenized), desc="creating X...", total=rnum):
    for w in tokens:
        X[i, wi[w]] += 1  # accumulate the term count

y = data.label
学習部分
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import balanced_accuracy_score
# Hyper-parameters handed to lgb.train for the binary classifier.
param = {
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,
    "num_leaves": 5,
    "learning_rate": 0.01,
    "bagging_fraction": 0.5,
    # Row subsampling is only performed when bagging_freq is non-zero; without
    # it the bagging_fraction above is silently ignored.
    "bagging_freq": 1,
    "feature_fraction": 0.5,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "bagging_seed": 777,
    "verbosity": -1,
    "seed": 777,
    "n_jobs": -1,  # degree of parallelism (number of threads)
}
def train(X_train, y_train, category, param, eval_func, verbose, early_stopping_rounds, n_estimators):
    """Fit a LightGBM booster on a hold-out split and print validation metrics.

    The data is split 75/25 without shuffling, so the last quarter of the rows
    becomes the validation set. The booster is trained with both the training
    and the validation set registered as evaluation sets.

    Args:
        X_train: Feature matrix accepted by ``train_test_split`` and ``lgb.Dataset``.
        y_train: Labels aligned with ``X_train``. The printed accuracy/F1
            figures threshold the prediction at 0.5.
        category: Passed as ``categorical_feature`` to ``lgb.Dataset``.
        param: LightGBM parameter dict passed to ``lgb.train``.
        eval_func: Callable invoked as ``eval_func(y_val, predictions)``.
        verbose: Evaluation logging period (``True`` means every round,
            a falsy value disables logging).
        early_stopping_rounds: Early-stopping patience; a falsy value
            disables early stopping.
        n_estimators: Maximum number of boosting rounds.

    Returns:
        dict with keys ``"tuple"`` (``(y_val, predictions)``), ``"eval_loss"``
        (the value returned by ``eval_func``) and ``"model"`` (the booster).
    """
    X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)
    trn_data = lgb.Dataset(X_trn, label=y_trn, categorical_feature=category)
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=category)

    if hasattr(lgb, "log_evaluation"):
        # The `verbose_eval` / `early_stopping_rounds` keywords were removed
        # from lgb.train in LightGBM 4.0; the callbacks are their replacement.
        callbacks = []
        log_period = int(verbose) if verbose else 0
        if log_period > 0:
            callbacks.append(lgb.log_evaluation(period=log_period))
        if early_stopping_rounds:
            callbacks.append(
                lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=bool(verbose))
            )
        fit_kwargs = {"callbacks": callbacks}
    else:
        # Older LightGBM without log_evaluation: keep the legacy keywords.
        fit_kwargs = {"verbose_eval": verbose, "early_stopping_rounds": early_stopping_rounds}

    clf = lgb.train(
        param,
        trn_data,
        n_estimators,
        valid_sets=[trn_data, val_data],
        **fit_kwargs,
    )

    # Predict once and reuse the scores for every metric below.
    val_pred = clf.predict(X_val)
    val_label = val_pred > 0.5

    eval_loss = eval_func(y_val, val_pred)
    print(f'end eval_loss={eval_loss}')
    print(f"accuracy = {accuracy_score(y_val, val_label)}")
    print(f"balanced = {balanced_accuracy_score(y_val, val_label)}")
    print(f"f1 = {f1_score(y_val, val_label)}")
    return {"tuple": (y_val, val_pred),
            'eval_loss': eval_loss,
            'model': clf}
# Train on the full matrix with no categorical columns; the hold-out split is
# scored with ROC AUC, logging every round, with up to 1000 boosting rounds
# and an early-stopping patience of 10.
obj = train(
    X,
    y,
    category=[],
    param=param,
    eval_func=roc_auc_score,
    verbose=1,
    early_stopping_rounds=10,
    n_estimators=1000,
)
# Expose the fitted booster under its own name.
clf = obj["model"]