SSE Directshow 过滤器答案

【问题标题】：SSE Directshow filter（SSE Directshow 过滤器）
【发布时间】：2014-10-29 12:56:32
【问题描述】：

上下文

我制作了一个 directshow 过滤器来更改视频的对比度和亮度。我想加快速度。

没有 SSE 的工作过滤器

// Scalar (non-SSE) version of the transform: adjusts contrast and brightness
// of the sample's image buffer in place, one RGBTRIPLE pixel at a time.
// The "..." lines are parts the author elided from the listing.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    ...


    BYTE *pData;                // Pointer to the actual image buffer

    // Fetch the address of the sample's pixel data; all edits below are made
    // directly through this pointer.
    pMediaSample->GetPointer(&pData);

    int numPixels  = cxImage * cyImage;
    ...
    prgb = (RGBTRIPLE*) pData;

    for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
       RGBTRIPLE *ppixel = prgb + iPixel;

       // channel = channel * contrast + brightness, written straight back
       // into the pixel.
       ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
       ppixel->rgbtBlue  = ppixel->rgbtBlue  * _contrastPower + _brightnessPower;
       ppixel->rgbtRed   = ppixel->rgbtRed   * _contrastPower + _brightnessPower;  

       // NOTE(review): the rgbt* members are BYTE in the Windows SDK's
       // RGBTRIPLE, so the value has already been narrowed by the assignment
       // above and these "> 255" checks presumably never fire; clamping would
       // need to happen on a wider temporary before the store. TODO confirm.
       if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
       if(ppixel->rgbtBlue>255)  ppixel->rgbtBlue  = 255;
       if(ppixel->rgbtRed>255)   ppixel->rgbtRed   = 255;
    }
    ...
}

SSE 过滤器（无法正常工作）

// SSE attempt at the contrast transform (brightness is not applied here).
// The review notes below mark the places where the code does not do what the
// surrounding comments say it does.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    BYTE *pData;                // Pointer to the actual image buffer
    long lDataLen;              // Holds length of any given sample
    int iPixel;                 // Used to loop through the image pixels        
    RGBTRIPLE *prgb;            // Holds a pointer to the current pixel

    AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
    VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;

    ASSERT(pvi);

    // NOTE(review): pMediaSample is only checked here, after m_pInput has
    // already been used above; the check itself guards the calls below.
    CheckPointer(pMediaSample,E_POINTER);
    pMediaSample->GetPointer(&pData);
    lDataLen = pMediaSample->GetSize();

    // Get the image properties from the BITMAPINFOHEADER

    int cxImage    = pvi->bmiHeader.biWidth;
    int cyImage    = pvi->bmiHeader.biHeight;
    int numPixels  = cxImage * cyImage;

    prgb = (RGBTRIPLE*) pData;

    // Contrast factor is hard-coded and broadcast to both double lanes.
    double dcontrast = 0.7;

    __m128d cStore = _mm_set1_pd(dcontrast); 

    // NOTE(review): allocated with new[] on every call and never released in
    // this function (no delete[] anywhere below), so each sample leaks
    // lDataLen bytes.
    BYTE *pDataOutput = new BYTE[lDataLen];

    // Advances four pixels per iteration.
    for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {

        //unpack to 32 bits
        // Loads 16 bytes starting at pixel iPixel, then keeps only the low
        // 8 bytes widened to 16 bits ...
        __m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
        // ... then only the low 4 of those widened to 32 bits, of which
        // _mm_cvtepi32_pd converts just the low 2 to double. Only the first
        // two bytes of each 16-byte load are therefore actually processed.
        __m128d  image  = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));

        //vector operations
        __m128d result = _mm_mul_pd(cStore, image);

         //pack back to 8 bits
        // _mm_cvtpd_epi32 produces two 32-bit results in the low half and
        // zeroes the upper half, so after both packs the 16 output bytes are
        // the two results followed by two zero bytes, repeated.
        __m128i pack_32 = _mm_cvtpd_epi32 (result); 
        __m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32); 
        __m128i pack_8  = _mm_packus_epi16(pack_16, pack_16); 

        //store the new pixel in pDataOutput
        // NOTE(review): pDataOutput is a BYTE*, so "+ iPixel" is a byte
        // offset, while the load above used prgb + iPixel (RGBTRIPLE stride).
        // The source and destination offsets therefore do not correspond, and
        // each store writes a full 16 bytes.
        _mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);

        //also tried to store the result in the original array
        // (the bytes written are mostly zeros, see the packing note above,
        // which is consistent with the reported black output)
        //_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
    }

    //assign the original pointer to point at the start of the new data array       
    // NOTE(review): pData is a local variable; reassigning it changes nothing
    // in the media sample, so the buffer obtained from GetPointer() is left
    // unmodified and the computed data is discarded.
    pData = pDataOutput;


    return NOERROR;
}

问题

这段代码对原始流没有任何作用：

//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;

这段代码使整个视频变黑：

 _mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);

问题

我是否正确使用了 SSE 指令？

如何将修改后的数据分配给原始媒体样本指针？

【问题讨论】：

使用双精度数进行算术运算几乎会抵消 SIMD 的全部优势——理想情况下，您应该使用定点运算，并使用能够满足需求的最窄数据类型。
@PaulR 似乎快了 4 倍
可能是因为原来的标量代码效率很低——你应该可以做得更好（假设你仍然想提高性能）。

标签： c++ directshow sse simd

【解决方案1】：

也许这个例子对你有用：

/// Applies `dst[i] = clamp(src[i] * contrast + brightness, 0, 255)` to every
/// byte of a packed 24-bit image, using the same coefficients for all channels.
///
/// @param src        Input pixels, `width * height * 3` bytes. No alignment required.
/// @param width      Image width in pixels.
/// @param height     Image height in pixels.
/// @param contrast   Multiplicative gain. Converted to fixed point with 8
///                   fractional bits and saturated to the int16 range
///                   (roughly [-128, 128)).
/// @param brightness Additive offset in 8-bit pixel units (may be negative).
/// @param dst        Output pixels, same size as `src`. No alignment required.
///
/// The result is `(contrastFixed * pixel + brightnessFixed) >> 8`, saturated to
/// [0, 255]. Products and sums are accumulated in 32-bit lanes: with 16-bit
/// lanes `255 * contrastFixed` wraps as soon as contrast exceeds ~0.5, which
/// corrupts bright pixels.
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness,  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;

    // Fixed-point contrast must fit the signed 16-bit multiplier lanes.
    double contrastFixed = contrast*(1 << shift);
    if(contrastFixed > 32767.0) contrastFixed = 32767.0;
    if(contrastFixed < -32768.0) contrastFixed = -32768.0;
    const int16_t contrast16 = int16_t(contrastFixed);

    // Brightness is added in 32 bits; bound it so that adding the largest
    // possible product (255 * 32767) cannot overflow int32. Any value near the
    // bound already saturates the output.
    const double brightnessLimit = double(1 << 30);
    double brightnessFixed = brightness*(1 << shift);
    if(brightnessFixed > brightnessLimit) brightnessFixed = brightnessLimit;
    if(brightnessFixed < -brightnessLimit) brightnessFixed = -brightnessLimit;
    const int32_t brightness32 = int32_t(brightnessFixed);

    const __m128i _zero = _mm_setzero_si128();
    const __m128i _contrast16 = _mm_set1_epi16(contrast16);
    const __m128i _brightness32 = _mm_set1_epi32(brightness32);

    const size_t step = sizeof(__m128i);
    size_t i = 0;

    // Full 16-byte vectors. Unaligned load/store so callers need not guarantee
    // 16-byte alignment of either buffer.
    for(; i + step <= size; i += step)
    {
        const __m128i _src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        const __m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _zero);
        const __m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _zero);

        // Interleaving with zero gives (pixel, 0) 16-bit pairs; madd against
        // (contrast, contrast) yields pixel * contrast as a 32-bit value.
        __m128i _d0 = _mm_madd_epi16(_mm_unpacklo_epi16(_srcLo16, _zero), _contrast16);
        __m128i _d1 = _mm_madd_epi16(_mm_unpackhi_epi16(_srcLo16, _zero), _contrast16);
        __m128i _d2 = _mm_madd_epi16(_mm_unpacklo_epi16(_srcHi16, _zero), _contrast16);
        __m128i _d3 = _mm_madd_epi16(_mm_unpackhi_epi16(_srcHi16, _zero), _contrast16);

        _d0 = _mm_srai_epi32(_mm_add_epi32(_d0, _brightness32), shift);
        _d1 = _mm_srai_epi32(_mm_add_epi32(_d1, _brightness32), shift);
        _d2 = _mm_srai_epi32(_mm_add_epi32(_d2, _brightness32), shift);
        _d3 = _mm_srai_epi32(_mm_add_epi32(_d3, _brightness32), shift);

        // Saturating narrowing: int32 -> int16 (signed), then int16 -> uint8.
        const __m128i _dstLo16 = _mm_packs_epi32(_d0, _d1);
        const __m128i _dstHi16 = _mm_packs_epi32(_d2, _d3);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), _mm_packus_epi16(_dstLo16, _dstHi16));
    }

    // Scalar tail for the bytes that do not fill a whole vector; same
    // arithmetic as the vector path so results are identical.
    for(; i < size; ++i)
    {
        const int32_t sum = int32_t(contrast16)*src[i] + brightness32;
        const int32_t value = sum <= 0 ? 0 : (sum >> shift);
        dst[i] = uint8_t(value > 255 ? 255 : value);
    }
}

如果对每个通道使用单独的系数：

/// Processes one 16-byte vector: `dst[k] = clamp((contrast[k] * src[k] +
/// brightness[k]) >> shift, 0, 255)` for k in [0, 16).
///
/// @param src          16 input bytes. No alignment required.
/// @param contrastLo   Eight int16 fixed-point gains for bytes 0..7.
/// @param contrastHi   Eight int16 fixed-point gains for bytes 8..15.
/// @param brightnessLo Eight int16 fixed-point offsets for bytes 0..7.
/// @param brightnessHi Eight int16 fixed-point offsets for bytes 8..15.
/// @param shift        Number of fractional bits in the coefficients.
/// @param dst          16 output bytes. No alignment required.
///
/// The multiply-add is carried out in 32-bit lanes so that
/// `255 * contrast + brightness` cannot wrap the way a 16-bit product does.
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi, 
    const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
    const __m128i _zero = _mm_setzero_si128();
    const __m128i _one = _mm_set1_epi16(1);
    const __m128i _count = _mm_cvtsi32_si128(shift);

    const __m128i _src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _zero);
    const __m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _zero);

    // (pixel, 1) pairs multiplied-and-added against (contrast, brightness)
    // pairs give pixel * contrast + brightness directly as 32-bit values.
    const __m128i _lo0 = _mm_sra_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(_srcLo16, _one),
        _mm_unpacklo_epi16(contrastLo, brightnessLo)), _count);
    const __m128i _lo1 = _mm_sra_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(_srcLo16, _one),
        _mm_unpackhi_epi16(contrastLo, brightnessLo)), _count);
    const __m128i _hi0 = _mm_sra_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(_srcHi16, _one),
        _mm_unpacklo_epi16(contrastHi, brightnessHi)), _count);
    const __m128i _hi1 = _mm_sra_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(_srcHi16, _one),
        _mm_unpackhi_epi16(contrastHi, brightnessHi)), _count);

    // Saturating narrowing: int32 -> int16 (signed), then int16 -> uint8.
    const __m128i _dstLo16 = _mm_packs_epi32(_lo0, _lo1);
    const __m128i _dstHi16 = _mm_packs_epi32(_hi0, _hi1);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_packus_epi16(_dstLo16, _dstHi16));
}


/// Applies a per-channel contrast/brightness adjustment to a packed 24-bit
/// image: `dst = clamp(src * contrast[ch] + brightness[ch], 0, 255)`, where
/// `ch` is the byte's position within its 3-byte pixel.
///
/// @param src        Input pixels, `width * height * 3` bytes. No alignment required.
/// @param width      Image width in pixels.
/// @param height     Image height in pixels.
/// @param contrast   Three gains, one per byte of a pixel in memory order.
/// @param brightness Three offsets in 8-bit pixel units, same order.
/// @param dst        Output pixels, same size as `src`. No alignment required.
///
/// Coefficients are converted to fixed point with 8 fractional bits and
/// saturated to int16, so gains are limited to roughly [-128, 128) and offsets
/// to roughly [-128, 128); values outside that range are clamped, not wrapped.
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3],  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;

    int16_t c[3], b[3];
    for(int ch = 0; ch < 3; ++ch)
    {
        double cf = contrast[ch]*(1 << shift);
        if(cf > 32767.0) cf = 32767.0;
        if(cf < -32768.0) cf = -32768.0;
        c[ch] = int16_t(cf);

        double bf = brightness[ch]*(1 << shift);
        if(bf > 32767.0) bf = 32767.0;
        if(bf < -32768.0) bf = -32768.0;
        b[ch] = int16_t(bf);
    }

    // A 16-byte vector is not a whole number of 3-byte pixels, so the channel
    // phase of each 8-lane half rotates; three rotations cover every case.
    __m128i _contrast[3], _brightness[3];
    _contrast[0] = _mm_setr_epi16(c[0], c[1], c[2], c[0], c[1], c[2], c[0], c[1]);
    _contrast[1] = _mm_setr_epi16(c[2], c[0], c[1], c[2], c[0], c[1], c[2], c[0]);
    _contrast[2] = _mm_setr_epi16(c[1], c[2], c[0], c[1], c[2], c[0], c[1], c[2]);
    _brightness[0] = _mm_setr_epi16(b[0], b[1], b[2], b[0], b[1], b[2], b[0], b[1]);
    _brightness[1] = _mm_setr_epi16(b[2], b[0], b[1], b[2], b[0], b[1], b[2], b[0]);
    _brightness[2] = _mm_setr_epi16(b[1], b[2], b[0], b[1], b[2], b[0], b[1], b[2]);

    // 48 bytes = 16 whole pixels, after which the channel phase repeats.
    const size_t vec = sizeof(__m128i);
    const size_t block = 3*vec;
    size_t i = 0;
    for(; i + block <= size; i += block)
    {
        Filter(src + i, _contrast[0], _contrast[1], _brightness[0], _brightness[1], shift, dst + i);
        Filter(src + i + vec, _contrast[2], _contrast[0], _brightness[2], _brightness[0], shift, dst + i + vec);
        Filter(src + i + 2*vec, _contrast[1], _contrast[2], _brightness[1], _brightness[2], shift, dst + i + 2*vec);
    }

    // Scalar tail for images whose byte size is not a multiple of 48; i is a
    // multiple of 48 here, so i % 3 is the channel index. Same arithmetic as
    // the vector path so results are identical.
    for(; i < size; ++i)
    {
        const size_t ch = i % 3;
        const int32_t sum = int32_t(c[ch])*src[i] + b[ch];
        const int32_t value = sum <= 0 ? 0 : (sum >> shift);
        dst[i] = uint8_t(value > 255 ? 255 : value);
    }
}

【讨论】：

它确实比我拥有的更好。但是一旦对比度超过0.5，即0.6,0.7，它就会改变各种像素颜色。
感谢您的新回答，但它给了我相同的输出。亮度适用于一些低值，如 -25 和 25，对比度适用于 0 到 0.5 的值。在 0.51 时，它开始修改原始像素的颜色，而不是在对比度接近 1.0 时变得更清晰
这是 int16 类型的溢出。您可以设置 shift = 7 或 6。
太好了！现在使用 shift 7 或 6 时它工作得很好。假设我们使用 shift 6，与 shift 7 相比，我们是否丢失了最后一位精度？有什么不同？此外，亮度部分现在似乎比我之前的过滤器好得多！我的代码可能有类似的溢出问题。