Tuesday, February 1, 2022

Doing MNIST with LightGBM.

This month I started learning how to use LightGBM so that I can do the calculations for my systematic trading.

According to what I found online, LightGBM can apparently handle MNIST.

So decision trees can do the sort of thing a neural network does.

That sounds fun.

So, for my first LightGBM program, I gave MNIST a try.



But while Python code is lying around everywhere, C++ sample code is nowhere to be found.

Even Microsoft's official site has no C++ samples.

Can you really do this in C++?


After a whole day of trial and error, I somehow got as far as training and prediction in C++.
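
The build itself is nothing special: include LightGBM/c_api.h and link against the lib_lightgbm shared library that building LightGBM produces. Something along these lines should work (the paths are placeholders for wherever you cloned and built LightGBM, and main.cpp is whatever you named the file below):

--------------------------
g++ -O2 main.cpp -o mnist_lgbm \
    -I /path/to/LightGBM/include \
    -L /path/to/LightGBM -l_lightgbm
--------------------------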

But in C++, training doesn't stop automatically at a good point.

Apparently you have to work out this hyperparameter, the stopping point, yourself.

How are you supposed to compute that?
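
What I ended up doing, and it is also exactly what the loop in main() below does, is the obvious manual version: update one iteration at a time, read the validation metric back with LGBM_BoosterGetEval, and stop once it has not improved for a few rounds. A minimal sketch of just that idea (bh is the booster handle, and data_idx 1 is the validation set registered with LGBM_BoosterAddValidData):

--------------------------
/* manual early stopping: stop once the validation metric (multi_logloss)
   has not improved for 5 consecutive iterations */
double best = 1e30;
int rounds_without_improvement = 0;
for (int i = 0; i < 300; i++) {
    int is_finished = 0;
    double metric = 0;
    LGBM_BoosterUpdateOneIter(bh, &is_finished);
    int out_len = 0;
    LGBM_BoosterGetEval(bh, 1, &out_len, &metric);  /* data_idx 1 = validation set */
    if (metric < best) { best = metric; rounds_without_improvement = 0; }
    else if (++rounds_without_improvement >= 5) break;
}
--------------------------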


But for now, it works.

Actually trying it made one thing clear: compared with other machine-learning libraries, LightGBM is dramatically faster, both to build the model and to train it.

From what I've read, MNIST is apparently around eight times faster than with a neural network.

And it stays fast even with a lot of input dimensions, like MNIST's 784 pixels.

When training is this snappy, you can throw in all sorts of features without it becoming a chore.

No wonder everyone uses it on Kaggle.



Source code

--------------------------


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "LightGBM/c_api.h"

/* MNIST files (IDX format), expected in the working directory */
#define TRAIN_IMAGE "train-images-idx3-ubyte"
#define TRAIN_LABEL "train-labels-idx1-ubyte"
#define TEST_IMAGE "t10k-images-idx3-ubyte"
#define TEST_LABEL "t10k-labels-idx1-ubyte"

/* hand-drawn 28x28 input image; 0/1 values, scaled up to 0-255
   in main() before prediction */
float data[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

/* simple row-major float matrix */
struct tensor {
    float* data;
    int cols;
    int rows;
};

static struct tensor* create_tensor(int rows, int cols) {
    struct tensor* ret;
    ret = (struct tensor*)malloc(sizeof(struct tensor));
    ret->data = (float*)malloc(sizeof(float) * rows * cols);
    memset(ret->data, 0, sizeof(float) * rows * cols);
    ret->cols = cols; ret->rows = rows;
    return ret;
}

static void free_tensor(struct tensor* t)
{
    if (t && t->data) free(t->data);
    if (t) free(t);
}

/* MNIST IDX headers store 32-bit integers in big-endian order */
static int buf2int(char* buf_) {
    int ret;
    unsigned char* buf = (unsigned char*)buf_;

    ret = buf[0]; ret <<= 8; ret |= buf[1]; ret <<= 8;
    ret |= buf[2]; ret <<= 8; ret |= buf[3];
    return ret;
}

/* load an IDX image file (magic 0x803) into an (n x 784) tensor */
static struct tensor* load_image_file(const char* fn)
{
    struct tensor* ret = NULL;
    FILE* fp;
    int sz, t, w, h, n, i, j;
    char buf[4];

    fp = fopen(fn, "rb");
    if (fp == NULL) goto end;
    fseek(fp, 0, SEEK_END);
    sz = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    fread(buf, 1, 4, fp);   /* magic number */
    t = buf2int(buf);
    if (t != 0x803) goto end;

    fread(buf, 1, 4, fp);   /* number of images */
    n = buf2int(buf);
    fread(buf, 1, 4, fp);   /* image rows (28) */
    w = buf2int(buf);
    fread(buf, 1, 4, fp);   /* image columns (28) */
    h = buf2int(buf);
    if (h * w != 784) goto end;

    ret = create_tensor(n, 784);
    for (i = 0; i < n; i++) {
        for (j = 0; j < 784; j++) {
            fread(buf, 1, 1, fp);
            ret->data[i * 784 + j] = (float)(buf[0] & 255);
        }
    }
end:
    if (fp) fclose(fp);
    return ret;
}

/* load an IDX label file (magic 0x801) into an (n x 1) tensor */
static struct tensor* load_label_file(const char* fn)
{
    struct tensor* ret = NULL;
    FILE* fp;
    int sz, t, n, i, j;
    char buf[4];

    fp = fopen(fn, "rb");
    if (fp == NULL) goto end;
    fseek(fp, 0, SEEK_END);
    sz = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    fread(buf, 1, 4, fp);   /* magic number */
    t = buf2int(buf);
    if (t != 0x801) goto end;

    fread(buf, 1, 4, fp);   /* number of labels */
    n = buf2int(buf);
    ret = create_tensor(n, 1);
    for (i = 0; i < n; i++) {
        fread(buf, 1, 1, fp);
        ret->data[i] = (float)buf[0];
    }
end:
    if (fp) fclose(fp);
    return ret;
}

int main(int argc, char* argv[])
{
    int ret;

    DatasetHandle hx_train, hx_test;

    struct tensor* x_train, * x_test;
    struct tensor* y_train, * y_test;

    x_train = load_image_file(TRAIN_IMAGE);
    y_train = load_label_file(TRAIN_LABEL);
    x_test = load_image_file(TEST_IMAGE);
    y_test = load_label_file(TEST_LABEL);

    /* wrap the raw float matrices in LightGBM Datasets; the test set
       reuses the training set's bin mappers via the reference argument */
    ret = LGBM_DatasetCreateFromMat(x_train->data, C_API_DTYPE_FLOAT32, x_train->rows,
        x_train->cols, 1, "", nullptr, &hx_train);
    if (ret) {
        printf("Error: LGBM_DatasetCreateFromMat()\n");
        return 1;
    }
    ret = LGBM_DatasetSetField(hx_train, "label", y_train->data, y_train->rows, C_API_DTYPE_FLOAT32);
    if (ret) {
        printf("Error: LGBM_DatasetSetField()\n");
        return 1;
    }

    ret = LGBM_DatasetCreateFromMat(x_test->data, C_API_DTYPE_FLOAT32, x_test->rows,
        x_test->cols, 1, "", hx_train, &hx_test);
    if (ret) {
        printf("Error: LGBM_DatasetCreateFromMat()\n");
        return 1;
    }
    ret = LGBM_DatasetSetField(hx_test, "label", y_test->data, y_test->rows, C_API_DTYPE_FLOAT32);
    if (ret) {
        printf("Error: LGBM_DatasetSetField()\n");
        return 1;
    }

    printf("Create Data OK!!\n");

    /* 10-class multiclass (softmax) objective */
    BoosterHandle bh;
    ret = LGBM_BoosterCreate(hx_train, "objective=multiclass "
        "num_class=10 num_boost_round=300 "
        "early_stopping_round=5", &bh);
    if (ret) {
        printf("Error: LGBM_BoosterCreate()\n");
        return 1;
    }

    /* register the test set as validation data (data_idx 1) */
    ret = LGBM_BoosterAddValidData(bh, hx_test);
    if (ret) {
        printf("Error: LGBM_BoosterAddValidData()\n");
        return 1;
    }

    /* manual training loop with hand-rolled early stopping:
       stop when the validation metric has not improved for 5 iterations */
    int is_finished = 0;
    double d_min = 10;
    int ct_min = 0;
    for (int i = 0; i < 300; i++) {
        double d = 0;
        ret = LGBM_BoosterUpdateOneIter(bh, &is_finished);
        if (ret) {
            printf("Error: LGBM_BoosterUpdateOneIter()\n");
            return 1;
        }
        int out_len = 10;
        ret = LGBM_BoosterGetEval(bh, 1, &out_len, &d);
        if (ret) {
            printf("Error: LGBM_BoosterGetEval()\n");
            return 1;
        }
        printf("%d %f\n", i, d);
        if (d < d_min) {
            d_min = d;
            ct_min = 0;
        }
        else {
            ct_min++;
            if (ct_min >= 5) break;
        }
    }

    printf("Update OK!!\n");

    /* predict the hand-drawn digit: scale the 0/1 pixels up to 0-255,
       then read back the 10 class probabilities */
    long long d_len = 10;
    double d[10];
    for (int j = 0; j < 784; j++) {
        data[j] = data[j] * 255;
    }
    ret = LGBM_BoosterPredictForMat(bh, data, C_API_DTYPE_FLOAT32, 1, 784,
        1, C_API_PREDICT_NORMAL, 0, -1, "", &d_len, d);
    if (ret) {
        printf("Error: LGBM_BoosterPredictForMat()\n");
        return 1;
    }
    for (int i = 0; i < 10; i++) {
        printf("d[%d]=%f\n", i, d[i]);
    }

    LGBM_BoosterFree(bh);
    LGBM_DatasetFree(hx_train);
    LGBM_DatasetFree(hx_test);
    free_tensor(x_train); free_tensor(y_train);
    free_tensor(x_test); free_tensor(y_test);
    return 0;
}

--------------------------
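
One thing the listing above doesn't do is score the whole test set rather than a single hand-drawn digit. A minimal sketch of how that could be added, assuming it is dropped into main() just before the Free calls (so bh, x_test and y_test are still alive), reusing the same LGBM_BoosterPredictForMat call:

--------------------------
/* sketch: predict all test images and count how often argmax matches the label */
long long n_out = 0;
double* probs = (double*)malloc(sizeof(double) * 10 * x_test->rows);
int n_ok = 0;
ret = LGBM_BoosterPredictForMat(bh, x_test->data, C_API_DTYPE_FLOAT32,
    x_test->rows, x_test->cols, 1, C_API_PREDICT_NORMAL, 0, -1, "", &n_out, probs);
if (ret == 0) {
    for (int i = 0; i < x_test->rows; i++) {
        int best = 0;
        for (int k = 1; k < 10; k++) {
            if (probs[i * 10 + k] > probs[i * 10 + best]) best = k;
        }
        if (best == (int)y_test->data[i]) n_ok++;
    }
    printf("test accuracy: %f\n", (double)n_ok / x_test->rows);
}
free(probs);
--------------------------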


Results

--------------------------

[LightGBM] [Info] Load from binary file x_train
Create Data OK!!
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 1.095544 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109606
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 629
[LightGBM] [Info] Start training from score -2.315501
[LightGBM] [Info] Start training from score -2.185988
[LightGBM] [Info] Start training from score -2.309610
[LightGBM] [Info] Start training from score -2.280987
[LightGBM] [Info] Start training from score -2.329271
[LightGBM] [Info] Start training from score -2.404064
[LightGBM] [Info] Start training from score -2.316346
[LightGBM] [Info] Start training from score -2.259366
[LightGBM] [Info] Start training from score -2.327732
[LightGBM] [Info] Start training from score -2.311121
0 1.679374
1 1.373263
2 1.159687
...
...
149 0.000824

Update OK!!

d[0]=0.000818
d[1]=0.000104
d[2]=0.236144
d[3]=0.014695
d[4]=0.573232
d[5]=0.003431
d[6]=0.016848
d[7]=0.002404
d[8]=0.142658
d[9]=0.009667

--------------------------


Oh nice, it's more or less right: d[4] is the largest, so it's calling the drawing a 4.
But using LightGBM from C++ is a pain.

