【问题标题】:SSE Directshow filter(SSE Directshow 过滤器)
【发布时间】:2014-10-29 12:56:32
【问题描述】:

上下文

我制作了一个 directshow 过滤器来更改视频的对比度和亮度。我想加快速度。

没有 SSE 的工作过滤器

// Scalar (non-SSE) brightness/contrast transform, abridged: the "..." lines mark
// parts of the original function that are not shown here.
// Scales every colour channel of each pixel by _contrastPower, adds _brightnessPower,
// and writes the result back into the sample buffer in place.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    ...


    BYTE *pData;                // Pointer to the actual image buffer

    pMediaSample->GetPointer(&pData);

    int numPixels  = cxImage * cyImage;
    ...
    // The sample buffer is treated as a packed array of RGBTRIPLE pixels.
    prgb = (RGBTRIPLE*) pData;

    for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
       RGBTRIPLE *ppixel = prgb + iPixel;

       // The result is stored straight back into the channel fields.
       ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
       ppixel->rgbtBlue  = ppixel->rgbtBlue  * _contrastPower + _brightnessPower;
       ppixel->rgbtRed   = ppixel->rgbtRed   * _contrastPower + _brightnessPower;  

       // NOTE(review): if the rgbt* fields are 8-bit (as in the Win32 RGBTRIPLE), the
       // assignments above have already truncated the value, so these comparisons can
       // never be true and overflow wraps instead of clamping - confirm; the clamp
       // would have to be applied to a wider temporary before storing.
       if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
       if(ppixel->rgbtBlue>255)  ppixel->rgbtBlue  = 255;
       if(ppixel->rgbtRed>255)   ppixel->rgbtRed   = 255;
    }
    ...
}

SSE 过滤器无效

// Attempted SSE2 version of the transform (the non-working code the question is about).
// Intends to scale the pixel data by a fixed contrast factor of 0.7; the review notes
// below mark the places where the visible code does not achieve that.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    BYTE *pData;                // Pointer to the actual image buffer
    long lDataLen;              // Holds length of any given sample
    int iPixel;                 // Used to loop through the image pixels        
    RGBTRIPLE *prgb;            // Holds a pointer to the current pixel

    AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
    VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;

    ASSERT(pvi);

    // NOTE(review): pvi is computed (dereferencing m_pInput) before pMediaSample is checked.
    CheckPointer(pMediaSample,E_POINTER);
    pMediaSample->GetPointer(&pData);
    lDataLen = pMediaSample->GetSize();

    // Get the image properties from the BITMAPINFOHEADER

    int cxImage    = pvi->bmiHeader.biWidth;
    int cyImage    = pvi->bmiHeader.biHeight;
    int numPixels  = cxImage * cyImage;

    prgb = (RGBTRIPLE*) pData;

    // Hard-coded contrast factor, broadcast into both double lanes.
    double dcontrast = 0.7;

    __m128d cStore = _mm_set1_pd(dcontrast); 

    // NOTE(review): this scratch buffer is never freed and never handed to the sample
    // in the visible code, so it leaks on every call.
    BYTE *pDataOutput = new BYTE[lDataLen];

    // Each iteration advances by 4 RGBTRIPLE elements of the input.
    for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {

        // Load 16 bytes and zero-extend the low 8 bytes to 16 bits, then the low four
        // of those to 32 bits. _mm_cvtepi32_pd converts only the two lowest 32-bit
        // lanes, so just the first 2 bytes of each load are actually processed.
        // NOTE(review): the 16-byte load may read past the buffer near its end - confirm.
        __m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
        __m128d  image  = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));

        // Multiply the two converted values by the contrast factor.
        __m128d result = _mm_mul_pd(cStore, image);

         // Narrow back to 8 bits. Only the two lowest lanes carry results; the other
         // lanes of pack_32 are zero, and each pack duplicates its input into both halves.
        __m128i pack_32 = _mm_cvtpd_epi32 (result); 
        __m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32); 
        __m128i pack_8  = _mm_packus_epi16(pack_16, pack_16); 

        // Store 16 bytes at BYTE offset iPixel of the scratch buffer. The input pointer
        // advances in RGBTRIPLE units while this one advances in bytes, so the two
        // strides do not match and successive 16-byte stores overlap.
        _mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);

        // Also tried storing the result in the original array:
        //_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
    }

    // pData is a local variable: reassigning it does not change the buffer owned by
    // pMediaSample, so the output computed above never reaches the stream.
    pData = pDataOutput;


    return NOERROR;
}

问题

这段代码对原始流没有任何作用:

//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;

这段代码使整个视频变黑:

 _mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);

问题

我是否正确使用了 SSE 指令?

如何将修改后的数据分配给原始媒体样本指针?

【问题讨论】:

  • 使用双精度数进行算术几乎会消除 SIMD 的任何优势 - 理想情况下,您需要使用定点并使用可以避免的最窄数据类型。
  • @PaulR 似乎快了 4 倍
  • 可能是因为原来的标量代码效率很低——你应该可以做得更好(假设你仍然想提高性能)。

标签: c++ directshow sse simd


【解决方案1】:

也许这个例子对你有用:

// Converts a real coefficient to fixed point with `shift` fractional bits, truncating
// toward zero and clamping to [-limit, limit] so the integer conversion cannot overflow.
static inline int32_t FilterFixedPoint(double value, int shift, int32_t limit)
{
    const double scaled = value * (1 << shift);
    if (!(scaled > -double(limit)))   // written this way so NaN also lands here
        return -limit;
    if (scaled > double(limit))
        return limit;
    return int32_t(scaled);
}

// Computes (pixel * contrast + brightness) >> shift for eight zero-extended 16-bit pixels
// using 32-bit intermediates, and returns the eight results saturated to int16.
static inline __m128i FilterScaleUniform8(const __m128i & src16, const __m128i & contrast32,
    const __m128i & brightness32, const __m128i & shiftCount)
{
    const __m128i zero = _mm_setzero_si128();
    // Interleaving with zero produces (pixel, 0) 16-bit pairs, so _mm_madd_epi16 yields
    // pixel * contrast + 0 * <upper half> as a full 32-bit product that cannot wrap.
    const __m128i lo = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(src16, zero), contrast32), brightness32);
    const __m128i hi = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(src16, zero), contrast32), brightness32);
    return _mm_packs_epi32(_mm_sra_epi32(lo, shiftCount), _mm_sra_epi32(hi, shiftCount));
}

// Applies dst[i] = clamp(src[i] * contrast + brightness, 0, 255) to a packed 24-bit image.
//
// src, dst   - buffers of width*height*3 bytes; no alignment is required and dst may equal src.
// contrast   - gain, quantised to 8 fractional bits; values outside about [-128, 128) are clamped.
// brightness - offset in pixel units, quantised to 8 fractional bits.
//
// The arithmetic is carried out in 32 bits: a 16-bit product wraps as soon as
// contrast * 256 * 255 exceeds 32767, i.e. for any contrast above roughly 0.5.
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness,  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;
    // The contrast must fit a signed 16-bit lane for _mm_madd_epi16.
    const int32_t contrastFixed = FilterFixedPoint(contrast, shift, 32767);
    // Beyond +/-2^24 the result saturates for every pixel, so clamping there is lossless
    // and keeps the 32-bit sum far from overflow.
    const int32_t brightnessFixed = FilterFixedPoint(brightness, shift, 1 << 24);

    const __m128i zero = _mm_setzero_si128();
    const __m128i shiftCount = _mm_cvtsi32_si128(shift);
    const __m128i _contrast32 = _mm_set1_epi32(contrastFixed);
    const __m128i _brightness32 = _mm_set1_epi32(brightnessFixed);

    size_t i = 0;
    // Vector part: 16 bytes per iteration, never touching memory past `size`.
    for(; i + sizeof(__m128i) <= size; i += sizeof(__m128i))
    {
        const __m128i _src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        const __m128i _dstLo16 = FilterScaleUniform8(_mm_unpacklo_epi8(_src8, zero), _contrast32, _brightness32, shiftCount);
        const __m128i _dstHi16 = FilterScaleUniform8(_mm_unpackhi_epi8(_src8, zero), _contrast32, _brightness32, shiftCount);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), _mm_packus_epi16(_dstLo16, _dstHi16));
    }
    // Scalar tail for the remaining (size % 16) bytes, same arithmetic as the vector part.
    for(; i < size; ++i)
    {
        const int32_t value = contrastFixed*src[i] + brightnessFixed;
        const int32_t scaled = value < 0 ? 0 : (value >> shift);
        dst[i] = uint8_t(scaled > 255 ? 255 : scaled);
    }
}

如果对每个通道使用单独的系数:

// Computes (pixel * contrast + brightness) >> shift for eight 16-bit lanes. All three
// inputs are widened to 32 bits first, so neither the product nor the sum can wrap;
// the eight results are returned saturated to int16.
static inline __m128i FilterWidenScale8(const __m128i & src16, const __m128i & contrast16,
    const __m128i & brightness16, const __m128i & shiftCount)
{
    const __m128i zero = _mm_setzero_si128();
    // (pixel, 0) and (contrast, 0) pairs: _mm_madd_epi16 gives the signed 32-bit product.
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(src16, zero), _mm_unpacklo_epi16(contrast16, zero));
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(src16, zero), _mm_unpackhi_epi16(contrast16, zero));
    // Duplicating a lane into both halves and shifting right by 16 sign-extends it to 32 bits.
    lo = _mm_add_epi32(lo, _mm_srai_epi32(_mm_unpacklo_epi16(brightness16, brightness16), 16));
    hi = _mm_add_epi32(hi, _mm_srai_epi32(_mm_unpackhi_epi16(brightness16, brightness16), 16));
    return _mm_packs_epi32(_mm_sra_epi32(lo, shiftCount), _mm_sra_epi32(hi, shiftCount));
}

// Transforms one 16-byte block: dst[k] = clamp((src[k] * contrast[k] + brightness[k]) >> shift, 0, 255).
//
// contrastLo/brightnessLo hold the signed 16-bit fixed-point coefficients (with `shift`
// fractional bits) for bytes 0..7, contrastHi/brightnessHi those for bytes 8..15.
// src and dst need no particular alignment. Intermediates are 32-bit, so large
// coefficients saturate the output instead of wrapping around in 16 bits.
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi, 
    const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i shiftCount = _mm_cvtsi32_si128(shift);
    const __m128i _src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i _dstLo16 = FilterWidenScale8(_mm_unpacklo_epi8(_src8, zero), contrastLo, brightnessLo, shiftCount);
    const __m128i _dstHi16 = FilterWidenScale8(_mm_unpackhi_epi8(_src8, zero), contrastHi, brightnessHi, shiftCount);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_packus_epi16(_dstLo16, _dstHi16));
}


// Converts a real coefficient to fixed point with `shift` fractional bits, truncating
// toward zero and clamping to [-limit, limit] so the integer conversion cannot overflow.
static inline int32_t FilterChannelFixedPoint(double value, int shift, int32_t limit)
{
    const double scaled = value * (1 << shift);
    if (!(scaled > -double(limit)))   // written this way so NaN also lands here
        return -limit;
    if (scaled > double(limit))
        return limit;
    return int32_t(scaled);
}

// Computes (pixel * contrast + brightness) >> shift for four zero-extended 32-bit pixels.
// Each pixel lane is a (pixel, 0) 16-bit pair, so _mm_madd_epi16 returns the exact signed
// 32-bit product with the low 16 bits of the matching contrast lane.
static inline __m128i FilterScaleLanes4(const __m128i & src32, const __m128i & contrast32,
    const __m128i & brightness32, const __m128i & shiftCount)
{
    return _mm_sra_epi32(_mm_add_epi32(_mm_madd_epi16(src32, contrast32), brightness32), shiftCount);
}

// Applies a per-channel transform to a packed 24-bit image:
//   dst[i] = clamp(src[i] * contrast[i % 3] + brightness[i % 3], 0, 255)
//
// src, dst   - buffers of width*height*3 bytes; no alignment is required and dst may equal src.
// contrast   - three gains in byte order of the pixel, quantised to 8 fractional bits;
//              values outside about [-128, 128) are clamped.
// brightness - three offsets in pixel units, quantised to 8 fractional bits.
//
// All arithmetic is 32-bit, so large gains/offsets saturate the output instead of wrapping
// in 16 bits, and any byte count is handled (not only multiples of 48).
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3],  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;

    int32_t c[3], b[3];
    for(int channel = 0; channel < 3; ++channel)
    {
        // The contrast must fit a signed 16-bit lane for _mm_madd_epi16.
        c[channel] = FilterChannelFixedPoint(contrast[channel], shift, 32767);
        // Beyond +/-2^24 every pixel saturates, so clamping there loses nothing.
        b[channel] = FilterChannelFixedPoint(brightness[channel], shift, 1 << 24);
    }

    // Coefficient vectors for a group of four consecutive bytes whose first byte belongs
    // to channel `phase`: the channels are phase, phase+1, phase+2, phase (mod 3).
    __m128i _contrast[3], _brightness[3];
    for(int phase = 0; phase < 3; ++phase)
    {
        const int next = (phase + 1)%3;
        const int last = (phase + 2)%3;
        _contrast[phase] = _mm_setr_epi32(c[phase], c[next], c[last], c[phase]);
        _brightness[phase] = _mm_setr_epi32(b[phase], b[next], b[last], b[phase]);
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128i shiftCount = _mm_cvtsi32_si128(shift);

    size_t i = 0;
    int phase = 0;   // channel of the first byte of the current 16-byte vector
    for(; i + sizeof(__m128i) <= size; i += sizeof(__m128i))
    {
        const int next = (phase + 1)%3;
        const int last = (phase + 2)%3;
        const __m128i _src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        const __m128i _srcLo16 = _mm_unpacklo_epi8(_src8, zero);
        const __m128i _srcHi16 = _mm_unpackhi_epi8(_src8, zero);
        // Byte groups 0-3, 4-7, 8-11, 12-15 start at channels phase, phase+1, phase+2, phase.
        const __m128i r0 = FilterScaleLanes4(_mm_unpacklo_epi16(_srcLo16, zero), _contrast[phase], _brightness[phase], shiftCount);
        const __m128i r1 = FilterScaleLanes4(_mm_unpackhi_epi16(_srcLo16, zero), _contrast[next], _brightness[next], shiftCount);
        const __m128i r2 = FilterScaleLanes4(_mm_unpacklo_epi16(_srcHi16, zero), _contrast[last], _brightness[last], shiftCount);
        const __m128i r3 = FilterScaleLanes4(_mm_unpackhi_epi16(_srcHi16, zero), _contrast[phase], _brightness[phase], shiftCount);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i),
            _mm_packus_epi16(_mm_packs_epi32(r0, r1), _mm_packs_epi32(r2, r3)));
        phase = next;   // 16 % 3 == 1, so the next vector starts one channel further on
    }
    // Scalar tail for the remaining (size % 16) bytes, same arithmetic as the vector part.
    for(; i < size; ++i)
    {
        const size_t channel = i%3;
        const int32_t value = c[channel]*src[i] + b[channel];
        const int32_t scaled = value < 0 ? 0 : (value >> shift);
        dst[i] = uint8_t(scaled > 255 ? 255 : scaled);
    }
}

【讨论】:

  • 它确实比我拥有的更好。但是一旦对比度超过0.5,即0.6,0.7,它就会改变各种像素颜色。
  • 感谢您的新回答,但它给了我相同的输出。亮度适用于一些低值,如 -25 和 25,对比度适用于 0 到 0.5 的值。在 0.51 时,它开始修改原始像素的颜色,而不是在对比度接近 1.0 时变得更清晰
  • 这是 int16 类型的溢出。您可以设置 shift = 7 或 6。
  • 现在它工作得很好了(移位 7 或 6 都可以)!假设我们移位 6,与移位 7 相比,我们是否丢失了最后一位?有什么不同?此外,亮度部分现在似乎比我之前的过滤器好得多!我的代码可能也有类似的溢出问题。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2023-03-15
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2020-04-18
相关资源
最近更新 更多