2019年7月12日金曜日

Mask R-CNNをやってみた

NewralNetwork Runtimeってとっても便利ですね。
おじさん、NewralNetwork Runtimeを使って、前からやりたかったMask R-CNNをやってみました。

Mask R-CNNって写真の中にある物の、位置、名前、マスク値がわかります。
マスク値がわかると何がよいかっていうと、「物の向き」や「道路の走れる領域」とかもマスク値からわかります。

あとは物までの距離(深度)がわかれば、カメラの映像だけで自動運転を行うための基礎技術がだいたいそろうことになります。その深度推定については、先月末にGoogleが画期的なアルゴリズムを発表したようです。
これもデータが手に入り次第試してみたいと思います。

なので、本日はMask R-CNNをやってみます。
まず、いつものようにどっかのサイトを調べます。

次に、先日作ったいつもの最小のNewralNetwork Runtimeを取ってきます。


こんな感じでコードを書けばよさそう。
---------------------------------------
#include<stdio.h>
#include<cstdarg>
#include<iostream>
#include<fstream>
#include<string>
#include<vector>

#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION


#include "opencv2_core.hpp"
#include "opencv2_imgproc.hpp"
#include "opencv2_imgproc_imgproc_c.h"
#include "opencv2_dnn.hpp"

#include "stb_image.h"
#include "stb_image_write.h"

// Swap the R and B channels of an interleaved 8-bit image in place, and force
// the alpha channel to fully opaque. Used to convert between the RGBA byte
// order produced by stb_image and the BGRA order OpenCV expects.
//
// p : pixel buffer holding x*y pixels of c interleaved channels
// x : image width in pixels
// y : image height in pixels
// c : channels per pixel (callers in this file always pass 4)
void changeb_g(unsigned char* p, int x, int y, int c)
{
    int ct = x * y;
    int i;
    unsigned char t;
    for (i = 0; i < ct; i++) {
        t = p[0];
        p[0] = p[2];
        p[2] = t;
        // BUG FIX: only touch the alpha byte when the pixel actually has one.
        // The old code wrote p[3] unconditionally, which is an out-of-bounds
        // write (and data corruption) for any c < 4.
        if (c >= 4) p[3] = 255;
        p += c;
    }
}


// Class names loaded by label_init() from mscoco_labels.names
// (vector index = class id reported by the network).
std::vector<std::string> classes;
// Per-class drawing colors loaded by label_init() from colors.txt;
// drawBox() cycles through them by class id.
std::vector<cv::Scalar> colors;

float confThreshold = 0.5; // Confidence threshold: detections scoring below this are discarded
float maskThreshold = 0.3; // Mask threshold: mask values below this are treated as background


// Populate the global `classes` and `colors` tables from text files on disk:
//
//   mscoco_labels.names : one class name per line (index = network class id)
//   colors.txt          : one "R G B" triple per line, parsed as doubles
//
// Missing or unreadable files are silently tolerated; the corresponding
// vector is simply left empty.
void label_init()
{
    // Load names of classes
    std::string classesFile = "mscoco_labels.names";
    std::ifstream ifs(classesFile.c_str());
    std::string line;
    while (getline(ifs, line)) classes.push_back(line);

    // Load the colors
    std::string colorsFile = "colors.txt";
    std::ifstream colorFptr(colorsFile.c_str());
    while (getline(colorFptr, line)) {
        char* pEnd;
        double r, g, b;
        r = strtod(line.c_str(), &pEnd);
        // BUG FIX: the second strtod must continue from where the first one
        // stopped. Previously both g and b were parsed from the position right
        // after r, so g always equaled b and the real blue value was dropped.
        g = strtod(pEnd, &pEnd);
        b = strtod(pEnd, NULL);
        colors.push_back(cv::Scalar(r, g, b, 255.0));
    }
}


// printf-style formatting into a std::string.
//
// format : printf format string
// ...    : arguments matching the format specifiers
// returns the formatted text (without the terminating '\0'), or an empty
// string on an encoding error.
//
// Fixes over the previous version:
//  - uses vsnprintf: passing a va_list to plain snprintf as a variadic
//    argument is undefined behavior,
//  - va_copy's the list before the measuring pass, since vsnprintf consumes
//    the list it is given,
//  - va_end is now actually executed (it used to sit after the return),
//  - the MSVC branch no longer leaves an embedded trailing '\0' in the result
//    (vsnprintf is standard-conforming on MSVC 2015+, so one portable path
//    replaces the #ifdef).
std::string format(const char* format, ...)
{
    va_list args;
    va_start(args, format);

    // First pass: measure the required length on a copy of the list.
    va_list measure;
    va_copy(measure, args);
    int len = vsnprintf(nullptr, 0, format, measure);
    va_end(measure);

    if (len < 0) { // encoding error
        va_end(args);
        return std::string();
    }

    // Second pass: format into a buffer with room for the terminator.
    std::vector<char> buf(static_cast<size_t>(len) + 1);
    vsnprintf(buf.data(), buf.size(), format, args);
    va_end(args);

    return std::string(buf.data(), static_cast<size_t>(len));
}
// Draw the predicted bounding box, colorize and show the mask on the image
void drawBox(cv::Mat& frame, int classId, float conf, cv::Rect box, cv::Mat& objectMask)
{
//Draw a rectangle displaying the bounding box
rectangle(frame, cv::Point(box.x, box.y), cv::Point(box.x + box.width, box.y + box.height), cv::Scalar(255, 178, 50), 3);

//Get the label for the class name and its confidence
std::string label = format("%.2f", conf);
if (!classes.empty())
{
CV_Assert(classId < (int)classes.size());
label = classes[classId] + ":" + label;
}

//Display the label at the top of the bounding box
int baseLine;
cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
box.y = cv::max(box.y, labelSize.height);
rectangle(frame, cv::Point(box.x, box.y - round(1.5*labelSize.height)), cv::Point(box.x + round(1.5*labelSize.width), box.y + baseLine), cv::Scalar(255, 255, 255), cv::FILLED);
putText(frame, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 0, 0), 1);

cv::Scalar color = colors[classId%colors.size()];
// Resize the mask, threshold, color and apply it on the image
resize(objectMask, objectMask, cv::Size(box.width, box.height));
cv::Mat mask = (objectMask > maskThreshold);
cv::Mat coloredRoi = (0.3 * color + 0.7 * frame(box));
coloredRoi.convertTo(coloredRoi, CV_8UC3);

// Draw the contours on the image
std::vector<cv::Mat> contours;
cv::Mat hierarchy;
mask.convertTo(mask, CV_8U);
findContours(mask, contours, hierarchy, cv::RETR_CCOMP, cv::CHAIN_APPROX_SIMPLE);
drawContours(coloredRoi, contours, -1, color, 5, cv::LINE_8, hierarchy, 100);
coloredRoi.copyTo(frame(box), mask);
}

// For each frame, extract the bounding box and mask for each detected object
// For each frame, extract the bounding box and mask for every detected
// object above the confidence threshold, and draw them onto `frame`.
//
// outs[0] : detections, 1x1xNx7 (image id, class id, score, l, t, r, b)
// outs[1] : masks, NxCxHxW (N boxes, C classes, HxW segmentation shape)
void postprocess(cv::Mat& frame, const std::vector<cv::Mat>& outs)
{
    cv::Mat detections = outs[0];
    cv::Mat masks = outs[1];

    const int numDetections = detections.size[2];

    // Flatten to one 7-element row per detection; coordinates are normalized.
    detections = detections.reshape(1, detections.total() / 7);

    for (int det = 0; det < numDetections; ++det)
    {
        const float score = detections.at<float>(det, 2);
        if (!(score > confThreshold))
            continue; // below the confidence cut-off

        // Bounding box in absolute pixel coordinates, clamped to the frame.
        const int classId = static_cast<int>(detections.at<float>(det, 1));
        int left   = static_cast<int>(frame.cols * detections.at<float>(det, 3));
        int top    = static_cast<int>(frame.rows * detections.at<float>(det, 4));
        int right  = static_cast<int>(frame.cols * detections.at<float>(det, 5));
        int bottom = static_cast<int>(frame.rows * detections.at<float>(det, 6));

        left   = cv::max(0, cv::min(left,   frame.cols - 1));
        top    = cv::max(0, cv::min(top,    frame.rows - 1));
        right  = cv::max(0, cv::min(right,  frame.cols - 1));
        bottom = cv::max(0, cv::min(bottom, frame.rows - 1));
        cv::Rect box(left, top, right - left + 1, bottom - top + 1);

        // View (no copy) of this detection's mask for its predicted class.
        cv::Mat objectMask(masks.size[2], masks.size[3], CV_32F,
                           masks.ptr<float>(det, classId));

        // Draw bounding box, colorize and show the mask on the image.
        drawBox(frame, classId, score, box, objectMask);
    }
}



int main()
{
unsigned char* p;
int x=-1, y=-1, n=-1;

//
p = stbi_load("zyobandou.jpg", &x, &y, &n, 4);
//p = stbi_load("apple.jpg", &x, &y, &n, 4);
if (p == NULL || x < 1 || y < 1)return 1;
changeb_g(p, x, y, 4);
cv::Mat color = cv::Mat(y, x, CV_8UC4);
memcpy(color.data, p, x * 4 * y);
stbi_image_free(p);

//
cv::Mat frame, blob;
cv::cvtColor(color, frame, CV_BGRA2BGR);
//cv::resize(img, img, cv::Size(224, 224));

std::string textGraph = "mask_rcnn_inception_v2_coco_2018_01_28.pbtxt";
std::string modelWeights = "frozen_inference_graph.pb";

cv::dnn::Net net = cv::dnn::readNetFromTensorflow(modelWeights, textGraph);
net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);

cv::dnn::blobFromImage(frame, blob, 1.0, cv::Size(frame.cols, frame.rows), cv::Scalar(), true, false);
net.setInput(blob);

label_init();
// Runs the forward pass to get output from the output layers
std::vector<cv::String> outNames(2);
outNames[0] = "detection_out_final";
outNames[1] = "detection_masks";
    std::vector<cv::Mat> outs;
net.forward(outs, outNames);

// Extract the bounding box and mask for each of the detected objects
postprocess(frame, outs);

cv::cvtColor(frame, color, CV_BGR2BGRA);
x = color.cols;
y = color.rows;
changeb_g(color.data, x, y, 4);
stbi_write_png("result.png", x, y, 4, color.data, 4 * x);

return 0;
}

---------------------------------------

たまには日本の画像で試験してみようと思い、常磐道の写真で実験。




なんかそれっぽくちゃんとできてるじゃん!
こういうのを数時間のプログラムで作れる時代なんですね。

0 件のコメント:

コメントを投稿