ARMの最適化されたFAST計算

ARM NEON を使って ARM Cortex-A8 上で ORB 特徴量を約 5 ms で計算している例を見つけました。しかし、私はその前段階である FAST 特徴点検出ですでに苦労しています。実装しようとしている論文はこちら（here）です。まず、Bright（明るい）制約と Dark（暗い）制約について確信が持てません。私の理解では、FAST では中心ピクセルの周囲に 9 個の暗いピクセル、または 9 個の明るいピクセルがあるかどうかを調べる必要があります。そのため、私は両方をチェックしています。ところが、私の実装は最後のシフト演算を行っていない段階でも、コーナーかどうかを判定する OpenCV の計算全体と比べて平均で 3 倍の時間がかかってしまいます。以下が現時点での私のコードです。どのような最適化ができるか、どなたか教えていただけないでしょうか。ARMの最適化されたFAST計算

 // Reference measurement: time OpenCV's own detector on the same image.
     Clock::time_point t0 = Clock::now();
     detectors[y]->detect(img, ocv_kps);
     Clock::time_point t1 = Clock::now();

     // NOTE: nothing in this snippet appends to my_kps yet, so the count
     // printed below stays 0 until the final corner test is implemented.
     vector<Point2f> my_kps;
     // Intensity threshold for FAST.
     const uchar th = 8;

     // Number of candidates currently waiting in each batch buffer.
     int b_cnt = 0;
     int d_cnt = 0;
     // Ring bit masks of up to four candidates, to be processed in parallel.
     uint32_t id_arr[4];
     uint32_t ib_arr[4];

     Clock::time_point t01 = Clock::now();
     for (int i = 3; i < img.rows - 3; i++) {
      // Pointers to the seven image rows touched by the radius-3 ring:
      // three above the centre row, the centre row, and three below.
      const uchar* Mt3 = img.ptr<uchar>(i - 3);
      const uchar* Mt2 = img.ptr<uchar>(i - 2);
      const uchar* Mt1 = img.ptr<uchar>(i - 1);
      const uchar* Mc = img.ptr<uchar>(i);
      const uchar* Mb1 = img.ptr<uchar>(i + 1);
      const uchar* Mb2 = img.ptr<uchar>(i + 2);
      const uchar* Mb3 = img.ptr<uchar>(i + 3);
      for (int j = 3; j < img.cols - 3; j++) {
       // Column offsets are used as plain int expressions (j + 3, j - 3, ...).
       // Storing them in uchar truncated the index modulo 256 and read the
       // wrong pixels on images wider than 255 columns.

       // Centre pixel and the four compass points of the ring.
       const uchar c = Mc[j];
       const uchar l = Mc[j - 3];
       const uchar r = Mc[j + 3];
       const uchar t = Mt3[j];
       const uchar b = Mb3[j];

       // Thresholds are kept as int: c + th can exceed 255 and c - th can be
       // negative. An 8-bit result would wrap around and invert the test for
       // very bright / very dark centre pixels.
       const int thb = c + th;
       const int thd = c - th;

       // Compass points that satisfy the bright constraint.
       const bool cbt = t > thb;
       const bool cbb = b > thb;
       const bool cbl = l > thb;
       const bool cbr = r > thb;
       // Compass points that satisfy the dark constraint.
       const bool cdt = t < thd;
       const bool cdb = b < thd;
       const bool cdl = l < thd;
       const bool cdr = r < thd;

       // Pre-test: at least two adjacent compass points must fulfil the
       // respective constraint before the full ring is worth examining.
       const bool bright_candidate = (cbl && cbt) || (cbt && cbr)
         || (cbr && cbb) || (cbb && cbl);
       const bool dark_candidate = (cdl && cdt) || (cdt && cdr)
         || (cdr && cdb) || (cdb && cdl);
       if (!bright_candidate && !dark_candidate) {
        continue;
       }

       // Load the 16 ring pixels once for both tests. Index k is the bit
       // position in the mask: 0 = top, 4 = right, 8 = bottom, 12 = left.
       const uchar ring[16] = {
         t, Mt3[j + 1], Mt2[j + 2], Mt1[j + 3],
         r, Mb1[j + 3], Mb2[j + 2], Mb3[j + 1],
         b, Mb3[j - 1], Mb2[j - 2], Mb1[j - 3],
         l, Mt1[j - 3], Mt2[j - 2], Mt3[j - 1] };

       if (bright_candidate) {
        uint32_t mask = 0;
        for (int k = 0; k < 16; ++k) {
         mask |= static_cast<uint32_t>(ring[k] > thb) << k;
        }
        // Bits 16..23 repeat bits 0..7 so that a run wrapping around the end
        // of the ring shows up as one contiguous run in the mask.
        ib_arr[b_cnt] = mask | ((mask & 0xFFu) << 16);
        b_cnt++;
        // Once four candidates are queued, check them together.
        if (b_cnt == 4) {
         // vld1q_u32 loads exactly the four queued masks. vld4_u32 would
         // read eight uint32_t values and run past the end of the array.
         uint32x4_t IB = vld1q_u32(ib_arr);
         /*
         * here the actual shift operation would take place
         */
         b_cnt = 0;
        }
       }

       if (dark_candidate) {
        uint32_t mask = 0;
        for (int k = 0; k < 16; ++k) {
         mask |= static_cast<uint32_t>(ring[k] < thd) << k;
        }
        id_arr[d_cnt] = mask | ((mask & 0xFFu) << 16);
        d_cnt++;
        // Once four candidates are queued, check them together.
        if (d_cnt == 4) {
         uint32x4_t IA = vld1q_u32(id_arr);
         /*
         * here the actual shift operation would take place
         */
         d_cnt = 0;
        }
       }
      }
     }
     // NOTE: candidates still queued in ib_arr / id_arr at this point (fewer
     // than four) are not processed; the final implementation must flush them.
     Clock::time_point t11 = Clock::now();
     cout << "my algorithm found " << my_kps.size()
       << " and ocv found " << ocv_kps.size() << endl;

     microseconds ms1 = std::chrono::duration_cast<microseconds>(t1 - t0);
     microseconds ms2 = std::chrono::duration_cast<microseconds>(t11 - t01);

     rs.Push(static_cast<double>(ms2.count()));
     cout << "my algorithm duration " << ms2.count()
       << " and ocv duration is " << ms1.count() << endl;

出典

2016-10-20 Felix Yah Batta Man

ARM アセンブラを少し掘り下げた結果、ARM 上で OpenCV の FAST-9 実装より少なくとも 2 倍高速に動作するコードを書くことができました。コードは GitHub で確認できます。さらに最適化するための提案があれば、ぜひお寄せください。私の Raspberry Pi 3 では、320×240 のグレースケール画像に対する処理時間はおおよそ次のとおりです：OpenCV が 2000 ミリ秒、私のアルゴリズムが 1000 ミリ秒。

出典

2016-11-17 08:09:00

ラズベリーパイの30fpsで動作するORB抽出器があります。

https://github.com/0xfaded/pislam

最適化は本当に黒魔術のようなもので、さらに悪いことに、ARM は A53 の最適化ガイドを公開していません。私たちが参照できる最良の資料は A57 向けのもので、A57 はおそらく同様の NEON ユニットを備えています。

ここで完全な答えを示すことはできませんが、私が取った手順について少し共有します。

私のFASTエクストラクタの最初の部分は、テストピクセルのリングをロードし、コードと同じように16ビットのベクトルに変換します。私はasmを直接書くのではなく、代わりにgcc組み込み関数を使いました。

それでも、gcc が次の 2 点を満たしていることは確認しました。

- レジスタを 1 つもスタックに退避（スピル）していないこと
- 各比較について最小限の命令数しか出力していないこと

最初の比較では、マスク（本来なら 0x80）によるビットの分離を行っていないことに気づくでしょう。これにより、そうでなければ定数を保持していたレジスタが解放され、gcc がレジスタをスピルせずに済むだけの余裕が生まれました。また、かなり強引な組み込み関数の使い方をしていることにも気づくでしょう。

これは

// Plain operator form of the mask update. By C++ precedence each right-hand
// side parses as (test >= x) & 0x40, i.e. the comparison result masked down
// to bit 0x40 and OR-ed into the accumulator.
// NOTE(review): the intrinsic version shown next to this snippet uses
// vcleq_u8 (<=) for the light test while this form uses >= — confirm which
// comparison is intended.
d0 |= test >= dark & 0x40; 
    l0 |= test >= light & 0x40;

と同等です

// Intrinsic form of the mask update. vbslq_u8(mask, a, b) takes the bits of
// a where mask is set and the bits of b elsewhere, so only bit 0x40 of d0/l0
// is replaced by the comparison result; all other bits are kept.
d0 = vbslq_u8(vdupq_n_u8(0x40u), vcgeq_u8(test, dark), d0); 
    l0 = vbslq_u8(vdupq_n_u8(0x40u), vcleq_u8(test, light), l0);

GCC は前者（`|=` を使った単純な書き方）も問題なくコンパイルしますが、出力される命令数が約 1.5 倍になります。

2 番目の部分では、16 ビットのベクトルに対して FAST-9 テストを行います。以下のコードは 16 命令にコンパイルされますが、これを思いつくまでに、断続的に考え続けてほぼ 1 か月かかりました。

// FAST-9 test on sixteen 8-bit lanes at once.
// NOTE(review): d0/d1 and l0/l1 are presumably the two bytes of the dark and
// light ring masks built in the first stage — confirm against Fast.h.
// vtstq_u8 sets a lane to all ones where (d0 & d1) is non-zero.
// NOTE(review): t0 and t1 are initialised from the identical expression.
uint8x16_t t0 = vtstq_u8(d0, d1); 
    uint8x16_t t1 = vtstq_u8(d0, d1); 

    // Bitwise select: where the selector bit is set take the l* value,
    // otherwise keep the d* value.
    t0 = vbslq_u8(t0, l0, d0); 
    t1 = vbslq_u8(t1, l1, d1); 

    // Count leading zero bits of t0 per lane, then shift t1 left by that
    // count minus one.
    // NOTE(review): for lanes where the count is 0, cnt - 1 wraps to 255 in an
    // 8-bit lane — confirm the resulting shift behaves as intended.
    uint8x16_t cntLo = vclzq_u8(t0); 
    uint8x16_t testLo = t1 << (cntLo - 1); 
    // In-place compare-with-zero: testLo becomes the per-lane result of
    // (testLo == 0). Written as inline asm to force the immediate-#0 form.
    asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testLo)); 

    // Same steps with the roles of t0 and t1 swapped.
    uint8x16_t cntHi = vclzq_u8(t1); 
    uint8x16_t testHi = t0 << (cntHi - 1); 
    asm("vceq.u8 %q0, %q0, #0" : [val] "+w" (testHi)); 

    // Combine both halves, then turn every non-zero lane into all ones.
    uint8x16_t result = (cntLo & testLo) | (cntHi & testHi); 
    result = vtstq_u8(result, result);

厄介なことに、GCC は testLo == 0 を、定数ゼロとの比較専用の命令である vceq.u8 %q0, %q0, #0 としてコンパイルしてくれません。そこで、これらの命令を手動で挿入し、さらに数命令を削りました。

少しでも参考になれば幸いです。 Fast.h

出典

2017-10-31 00:02:51 user364952

ARMの最適化されたFAST計算

答えて

関連する問題