Opencv LineMod源碼閱讀

linemod 算法來自:

Gradient Response Maps for Real-Time Detection of Textureless Objects,

Dominant Orientation Templates for Real-Time Detection of Texture-Less Objects


  • computeResponseMaps 的函數接口

static void computeResponseMaps(const Mat& src, std::vector<Mat>& response_maps)

輸入的 src 是量化並且擴散(spread)之後的梯度方向圖,response_maps 是 8 張分別對應 8 個量化方向的響應圖;顧名思義,響應圖上某個位置的值越大,該位置與對應方向的匹配度就越高。


static void computeResponseMaps(const Mat& src, std::vector<Mat>& response_maps)
  CV_Assert((src.rows * src.cols) % 16 == 0);

  // Allocate response maps
  for (int i = 0; i < 8; ++i)
    response_maps[i].create(src.size(), CV_8U);

  Mat lsb4(src.size(), CV_8U);
  Mat msb4(src.size(), CV_8U);

  for (int r = 0; r < src.rows; ++r)
    const uchar* src_r = src.ptr(r);
    uchar* lsb4_r = lsb4.ptr(r);
    uchar* msb4_r = msb4.ptr(r);

    for (int c = 0; c < src.cols; ++c)
      // Least significant 4 bits of spread image pixel
      lsb4_r[c] = src_r[c] & 15;
      // Most significant 4 bits, right-shifted to be in [0, 16)
      msb4_r[c] = (src_r[c] & 240) >> 4;

#if CV_SSSE3
  volatile bool haveSSSE3 = checkHardwareSupport(CV_CPU_SSSE3);
  if (haveSSSE3)
    const __m128i* lut = reinterpret_cast<const __m128i*>(SIMILARITY_LUT);
    for (int ori = 0; ori < 8; ++ori)
      __m128i* map_data = response_maps[ori].ptr<__m128i>();
      __m128i* lsb4_data = lsb4.ptr<__m128i>();
      __m128i* msb4_data = msb4.ptr<__m128i>();

      // Precompute the 2D response map S_i (section 2.4)
      for (int i = 0; i < (src.rows * src.cols) / 16; ++i)
        // Using SSE shuffle for table lookup on 4 orientations at a time
        // The most/least significant 4 bits are used as the LUT index
        __m128i res1 = _mm_shuffle_epi8(lut[2*ori + 0], lsb4_data[i]);
        __m128i res2 = _mm_shuffle_epi8(lut[2*ori + 1], msb4_data[i]);

        // Combine the results into a single similarity score
        map_data[i] = _mm_max_epu8(res1, res2);
    // For each of the 8 quantized orientations...
    for (int ori = 0; ori < 8; ++ori)
      uchar* map_data = response_maps[ori].ptr<uchar>();
      uchar* lsb4_data = lsb4.ptr<uchar>();
      uchar* msb4_data = msb4.ptr<uchar>();
      const uchar* lut_low = SIMILARITY_LUT + 32*ori;
      const uchar* lut_hi = lut_low + 16;

      for (int i = 0; i < src.rows * src.cols; ++i)
        map_data[i] = std::max(lut_low[ lsb4_data[i] ], lut_hi[ msb4_data[i] ]);
  • LUT計算




代碼來自於大神的github :

#include <iostream>
#include <vector>
using namespace std;

// One of the 8 quantized orientations, linked into a ring so that the
// neighbouring orientations can be reached in both directions.
struct Node {
    int value;  // bit mask of this orientation: 1 << index
    int prev;   // index of the previous orientation on the ring
    int next;   // index of the next orientation on the ring
};

// Generates and prints the similarity look-up table: 8 orientations x 2 nibble
// segments x 16 nibble values. Each entry is 4 minus the ring distance between
// the orientation and the closest orientation bit set in the nibble
// (0 when the nibble is empty).
int main()
{
    std::vector<Node> nodes(8);
    for (int i = 0; i<8; i++){
        nodes[i].value = (1 << i);
        nodes[i].prev = i - 1;
        nodes[i].next = i + 1;
    }
    // Close the ring
    nodes[0].prev = 7;
    nodes[7].next = 0;

    unsigned short LUT[8 * 2 * 16] = { 0 };

    for (int i = 0; i<8; i++){ // 8 ori
        for (int m = 0; m<2; m++){ // 2 seg
            for (int n = 0; n<16; n++){ // 16 index

                if (n == 0){ // no ori
                    LUT[n + m * 16 + i * 16 * 2] = 0;
                    continue;
                }

                // Place the nibble in the low (m == 0) or high (m == 1) half of the byte
                int res = (n << (m * 4));
                auto current_node_go_forward = nodes[i];
                auto current_node_go_back = nodes[i];
                int angle_diff = 0;
                // Walk outwards in both directions until an orientation bit
                // present in res is hit; res != 0 here, so this ends within 4 steps.
                while (true){
                    if ((current_node_go_forward.value & res) > 0 ||
                        (current_node_go_back.value & res) > 0){
                        break;
                    }
                    else{
                        current_node_go_back = nodes[current_node_go_back.prev];
                        current_node_go_forward = nodes[current_node_go_forward.next];
                        angle_diff++;
                    }
                }
                LUT[n + m * 16 + i * 16 * 2] = 4 - angle_diff;
            }
        }
    }

    // One output row (32 values) per orientation
    for (int i = 0; i<8; i++){
        for (int m = 0; m<32; m++){
            std::cout << int(LUT[i * 32 + m]) << ", ";
        }
        std::cout << "\n";
    }

    return 0;
}


  • SSE計算



這裏的代碼用到了 _mm_shuffle_epi8以及_mm_max_epu8



Return value

The return value can be expressed by the following equations:

r0 = (mask0 & 0x80) ? 0 : SELECT(a, mask0 & 0x0f)

r1 = (mask1 & 0x80) ? 0 : SELECT(a, mask1 & 0x0f)

...

r15 = (mask15 & 0x80) ? 0 : SELECT(a, mask15 & 0x0f)





x86, x64

Header file <tmmintrin.h>


r0-r15 and mask0-mask15 are the sequentially ordered 8-bit components of return value r and parameter mask. r0 and mask0 are the least significant 8 bits.

SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter is the least significant 8-bits.

mask provides the mapping of bytes from parameter a to bytes in the result. If the byte in mask has its highest bit set, the corresponding byte in the result will be set to zero.

Before you use this intrinsic, software must ensure that the processor supports the instruction.

#include <stdio.h>
#include <tmmintrin.h>

int main ()
    __m128i a, mask;

    a.m128i_i8[0] = 1;
    a.m128i_i8[1] = 2;
    a.m128i_i8[2] = 4;
    a.m128i_i8[3] = 8;
    a.m128i_i8[4] = 16;
    a.m128i_i8[5] = 32;
    a.m128i_i8[6] = 64;
    a.m128i_i8[7] = 127;
    a.m128i_i8[8] = -2;
    a.m128i_i8[9] = -4;
    a.m128i_i8[10] = -8;
    a.m128i_i8[11] = -16;
    a.m128i_i8[12] = -32;
    a.m128i_i8[13] = -64;
    a.m128i_i8[14] = -128;
    a.m128i_i8[15] = -1;

    mask.m128i_u8[0] = 0x8F;
    mask.m128i_u8[1] = 0x0E;
    mask.m128i_u8[2] = 0x8D;
    mask.m128i_u8[3] = 0x0C;
    mask.m128i_u8[4] = 0x8B;
    mask.m128i_u8[5] = 0x0A;
    mask.m128i_u8[6] = 0x89;
    mask.m128i_u8[7] = 0x08;
    mask.m128i_u8[8] = 0x87;
    mask.m128i_u8[9] = 0x06;
    mask.m128i_u8[10] = 0x85;
    mask.m128i_u8[11] = 0x04;
    mask.m128i_u8[12] = 0x83;
    mask.m128i_u8[13] = 0x02;
    mask.m128i_u8[14] = 0x81;
    mask.m128i_u8[15] = 0x00;

    __m128i res = _mm_shuffle_epi8(a, mask);

    printf_s("Result res:\t%2d\t%2d\t%2d\t%2d\n\t\t%2d\t%2d\t%2d\t%2d\n",
                res.m128i_i8[0], res.m128i_i8[1], res.m128i_i8[2], 
                res.m128i_i8[3], res.m128i_i8[4], res.m128i_i8[5], 
                res.m128i_i8[6], res.m128i_i8[7]);
                res.m128i_i8[8],  res.m128i_i8[9], res.m128i_i8[10], 
                res.m128i_i8[11], res.m128i_i8[12], res.m128i_i8[13], 
                res.m128i_i8[14], res.m128i_i8[15]);

    return 0;
  • hysteresisGradient


/**
 * @brief Quantize gradient orientations and keep only locally consistent ones.
 *
 * The angle image is scaled into 16 buckets, folded into 8 orientation labels
 * with `& 7`, and then filtered: a pixel is accepted only if its magnitude
 * exceeds @p threshold and at least NEIGHBOR_THRESHOLD pixels of its 3x3
 * neighbourhood share the most frequent label.
 *
 * @param magnitude        Gradient magnitude image, read as float rows.
 * @param quantized_angle  Output CV_8U image of the size of @p angle. Accepted
 *                         pixels hold a single bit `1 << label`; all other
 *                         pixels, including the 1-pixel border, are 0.
 * @param angle            Gradient orientation image (the 360-degree range is
 *                         mapped onto 16 buckets).
 * @param threshold        Minimum magnitude for a pixel to be considered.
 */
void hysteresisGradient(Mat& magnitude, Mat& quantized_angle,
                        Mat& angle, float threshold)
{
  // Quantize 360 degree range of orientations into 16 buckets
  // Note that [0, 11.25), [348.75, 360) both get mapped in the end to label 0,
  // for stability of horizontal and vertical features.
  Mat_<unsigned char> quantized_unfiltered;
  angle.convertTo(quantized_unfiltered, CV_8U, 16.0 / 360.0);

  // Zero out top and bottom rows
  /// @todo is this necessary, or even correct?
  memset(quantized_unfiltered.ptr(), 0, quantized_unfiltered.cols);
  memset(quantized_unfiltered.ptr(quantized_unfiltered.rows - 1), 0, quantized_unfiltered.cols);
  // Zero out first and last columns
  for (int r = 0; r < quantized_unfiltered.rows; ++r)
  {
    quantized_unfiltered(r, 0) = 0;
    quantized_unfiltered(r, quantized_unfiltered.cols - 1) = 0;
  }

  // Mask 16 buckets into 8 quantized orientations
  for (int r = 1; r < angle.rows - 1; ++r)
  {
    uchar* quant_r = quantized_unfiltered.ptr<uchar>(r);
    for (int c = 1; c < angle.cols - 1; ++c)
    {
      quant_r[c] &= 7;
    }
  }

  // Filter the raw quantized image. Only accept pixels where the magnitude is above some
  // threshold, and there is local agreement on the quantization.
  quantized_angle = Mat::zeros(angle.size(), CV_8U);
  for (int r = 1; r < angle.rows - 1; ++r)
  {
    float* mag_r = magnitude.ptr<float>(r);

    for (int c = 1; c < angle.cols - 1; ++c)
    {
      if (mag_r[c] > threshold)
      {
        // Compute histogram of quantized bins in 3x3 patch around pixel
        int histogram[8] = {0, 0, 0, 0, 0, 0, 0, 0};

        // Top row of the patch
        uchar* patch3x3_row = &quantized_unfiltered(r-1, c-1);
        histogram[patch3x3_row[0]]++;
        histogram[patch3x3_row[1]]++;
        histogram[patch3x3_row[2]]++;

        // Middle row
        patch3x3_row += quantized_unfiltered.step1();
        histogram[patch3x3_row[0]]++;
        histogram[patch3x3_row[1]]++;
        histogram[patch3x3_row[2]]++;

        // Bottom row
        patch3x3_row += quantized_unfiltered.step1();
        histogram[patch3x3_row[0]]++;
        histogram[patch3x3_row[1]]++;
        histogram[patch3x3_row[2]]++;

        // Find bin with the most votes from the patch
        int max_votes = 0;
        int index = -1;
        for (int i = 0; i < 8; ++i)
        {
          if (max_votes < histogram[i])
          {
            index = i;
            max_votes = histogram[i];
          }
        }

        // Only accept the quantization if majority of pixels in the patch agree
        static const int NEIGHBOR_THRESHOLD = 5;
        if (max_votes >= NEIGHBOR_THRESHOLD)
          quantized_angle.at<uchar>(r, c) = uchar(1 << index);
      }
    }
  }
}




1. convertTo 的結果:[0, 11.25) 的結果是 0,[348.75, 360) 對應的結果是 16(之後經過 &7 操作同樣得到 0);

[11.25, 33.75) 的結果是 1,觀察的結果:

13 33 343.5 -> 15 ->7
28 14 351 -> 16  ->0
13 13 5 -> 0  ->0
43 38 178 ->8  ->0
12 12 11.98 ->1 ->1

2. &7操作




3   if (max_votes >= NEIGHBOR_THRESHOLD)


uchar(1 << index)  index是方向標識,1<<7就是對應128


  • 特徵匹配

void Detector::match(const std::vector<Mat>& sources, float threshold, std::vector<Match>& matches,

                     const std::vector<String>& class_ids, OutputArrayOfArrays quantized_images,
                     const std::vector<Mat>& masks) const


  • 根據定義特徵計算線性存儲的ResponseMap
  // For each pyramid level, precompute linear memories for each modality
  std::vector<Size> sizes;
  for (int l = 0; l < pyramid_levels; ++l)
  {
    int T = T_at_level[l];
    std::vector<LinearMemories>& lm_level = lm_pyramid[l];

    // Every level after the first works on a downsampled version of the input
    if (l > 0)
    {
      for (int i = 0; i < (int)quantizers.size(); ++i)
        quantizers[i]->pyrDown();
    }

    Mat quantized, spread_quantized;
    std::vector<Mat> response_maps;
    for (int i = 0; i < (int)quantizers.size(); ++i)
    {
      // Quantize -> spread by T -> 8 response maps -> linearized memories
      quantizers[i]->quantize(quantized);
      spread(quantized, spread_quantized, T);
      computeResponseMaps(spread_quantized, response_maps);

      LinearMemories& memories = lm_level[i];
      for (int j = 0; j < 8; ++j)
        linearize(response_maps[j], memories[j], T);

      if (quantized_images.needed()) //use copyTo here to side step reference semantics.
        quantized.copyTo(quantized_images.getMatRef(static_cast<int>(l*quantizers.size() + i)));
    }

    // Remember the image size used at this pyramid level
    sizes.push_back(quantized.size());
  }





  • 在金字塔高層計算相似度函數
static void similarity(const std::vector<Mat>& linear_memories, const Template& templ,
                Mat& dst, Size size, int T)
  // 63 features or less is a special case because the max similarity per-feature is 4.
  // 255/4 = 63, so up to that many we can add up similarities in 8 bits without worrying
  // about overflow. Therefore here we use _mm_add_epi8 as the workhorse, whereas a more
  // general function would use _mm_add_epi16.
  CV_Assert(templ.features.size() <= 63);
  /// @todo Handle more than 255/MAX_RESPONSE features!!

  // Decimate input image size by factor of T
  int W = size.width / T;
  int H = size.height / T;

  // Feature dimensions, decimated by factor T and rounded up
  int wf = (templ.width - 1) / T + 1;
  int hf = (templ.height - 1) / T + 1;

  // Span is the range over which we can shift the template around the input image
  int span_x = W - wf;
  int span_y = H - hf;

  // Compute number of contiguous (in memory) pixels to check when sliding feature over
  // image. This allows template to wrap around left/right border incorrectly, so any
  // wrapped template matches must be filtered out!
  int template_positions = span_y * W + span_x + 1; // why add 1?
  //int template_positions = (span_y - 1) * W + span_x; // More correct?

  /// @todo In old code, dst is buffer of size m_U. Could make it something like
  /// (span_x)x(span_y) instead?
  dst = Mat::zeros(H, W, CV_8U);
  uchar* dst_ptr = dst.ptr<uchar>();

#if CV_SSE2
  volatile bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SSE3
  volatile bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);

  // Compute the similarity measure for this template by accumulating the contribution of
  // each feature
  for (int i = 0; i < (int)templ.features.size(); ++i)
    // Add the linear memory at the appropriate offset computed from the location of
    // the feature in the template
    Feature f = templ.features[i];
    // Discard feature if out of bounds
    /// @todo Shouldn't actually see x or y < 0 here?
    if (f.x < 0 || f.x >= size.width || f.y < 0 || f.y >= size.height)
    const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);

    // Now we do an aligned/unaligned add of dst_ptr and lm_ptr with template_positions elements
    int j = 0;
    // Process responses 16 at a time if vectorization possible
#if CV_SSE2
#if CV_SSE3
    if (haveSSE3)
      // LDDQU may be more efficient than MOVDQU for unaligned load of next 16 responses
      for ( ; j < template_positions - 15; j += 16)
        __m128i responses = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(lm_ptr + j));
        __m128i* dst_ptr_sse = reinterpret_cast<__m128i*>(dst_ptr + j);
        *dst_ptr_sse = _mm_add_epi8(*dst_ptr_sse, responses);
    if (haveSSE2)
      // Fall back to MOVDQU
      for ( ; j < template_positions - 15; j += 16)
        __m128i responses = _mm_loadu_si128(reinterpret_cast<const __m128i*>(lm_ptr + j));
        __m128i* dst_ptr_sse = reinterpret_cast<__m128i*>(dst_ptr + j);
        *dst_ptr_sse = _mm_add_epi8(*dst_ptr_sse, responses);
    for ( ; j < template_positions; ++j)
      dst_ptr[j] = uchar(dst_ptr[j] + lm_ptr[j]);

計算前首先根據模板的大小和圖像的大小計算需要平移的數目(Compute number of contiguous (in memory) pixels to check when sliding feature),其中accessLinearMemory可以直接獲得某個特定方向的整個線性化的Responsemap的響應大小。實際上這裏不涉及到模板窗口的平移操作,僅僅進行線性化尋址獲得響應值,再將所有特徵的響應值進行相加,就得到了匹配結果圖。對應的論文原理如圖:

因此在進行匹配結果計算時,結果圖中每個像素點代表的是模板的Anchor cell(錨點)在該位置的響應值。最大的情況是所有特徵都完全匹配、每個特徵的響應都是最大值4,而8bit的圖像最大值是255(63 × 4 = 252 ≤ 255),這也是源碼中限制最多63個特徵點的原因。


注意最後的SSE加速訪問:j += 16 是平移16個uchar的內存空間,一個uchar內存空間爲8bit,所以正好平移了128bit的內存,對應着SSE一次訪問的內存寬度;在最後不足128bit的內存中,採用非SSE方式逐個元素進行補全。reinterpret_cast<const __m128i*>是強制類型轉換,lm_ptr是一個const uchar*型指針。



