Added another implementation with a different memory access pattern (incomplete)
Respected cache line size
@@ -33,8 +33,9 @@ | ||
33 | 33 | size_t width = imageInfo.width; |
34 | 34 | size_t height = imageInfo.height; |
35 | 35 | assert(imageInfo.bitsPerSample == 8 && imageInfo.samplesPerPixel == 1); |
36 | - const size_t size = width * height; | |
37 | 36 | |
37 | + size_t lineSize = (width + 63) & (~63); | |
38 | + const size_t size = lineSize * height; | |
38 | 39 | // std::vector<unsigned char> in(size); |
39 | 40 | // std::vector<unsigned char> dest(size); |
40 | 41 | // std::vector<unsigned char> work(size); |
@@ -44,7 +45,7 @@ | ||
44 | 45 | unsigned char* pWork2 = (unsigned char*) _mm_malloc(size*2, 64); |
45 | 46 | |
46 | 47 | unsigned char palettes[256 * 4]; |
47 | - ReadImageData(fo, pSrc, width, palettes); | |
48 | + ReadImageData(fo, pSrc, lineSize, palettes); | |
48 | 49 | fclose(f); |
49 | 50 | |
50 | 51 | for (size_t i=0; i<size; ++i) { |
@@ -60,6 +61,7 @@ | ||
60 | 61 | // const size_t nThreads = 4; |
61 | 62 | #else |
62 | 63 | const size_t nThreads = si.dwNumberOfProcessors; |
64 | +// const size_t nThreads = 1; | |
63 | 65 | #endif |
64 | 66 | Threads<blur_1b::Parameter> threads; |
65 | 67 | threads.SetUp(nThreads); |
@@ -71,7 +73,7 @@ | ||
71 | 73 | pCommon.height = partHeight; |
72 | 74 | pCommon.srcLineOffsetBytes = |
73 | 75 | pCommon.workLineOffsetBytes = |
74 | - pCommon.destLineOffsetBytes = width; | |
76 | + pCommon.destLineOffsetBytes = lineSize; | |
75 | 77 | pCommon.radius = 1; |
76 | 78 | pCommon.iterationCount = 1; |
77 | 79 | std::vector<blur_1b::Parameter> params(nThreads); |
@@ -87,15 +89,15 @@ | ||
87 | 89 | p.pWork = pWork + i * partSize * 2; |
88 | 90 | p.pWork2 = pWork2 + i * partSize * 2; |
89 | 91 | p.pDest = pDest + i * partSize; |
90 | - p.pTotal = _mm_malloc(width * sizeof(int32_t), 64); | |
91 | - p.pModi = _mm_malloc(width * sizeof(int32_t), 64); | |
92 | + p.pTotal = _mm_malloc(lineSize * sizeof(int32_t), 64); | |
93 | + p.pModi = _mm_malloc(lineSize * sizeof(int32_t), 64); | |
92 | 94 | } |
93 | 95 | typedef void (*BlurFuncPtr)(const blur_1b::Parameter& p); |
94 | 96 | BlurFuncPtr ptrs[] = { |
95 | 97 | //blur_1b::test_1, |
96 | 98 | //blur_1b::test_2, |
97 | - blur_1b::test_3, | |
98 | - blur_1b::test_4, | |
99 | + //blur_1b::test_3, | |
100 | + //blur_1b::test_4, | |
99 | 101 | blur_1b::test_5_h, |
100 | 102 | blur_1b::test_5_v, |
101 | 103 | blur_1b::test_5_h, |
@@ -107,6 +109,7 @@ | ||
107 | 109 | blur_1b::test_10, |
108 | 110 | blur_1b::test_11, |
109 | 111 | blur_1b::test_12, |
112 | + blur_1b::test_13, | |
110 | 113 | |
111 | 114 | //blur_1b::test_21, |
112 | 115 | //blur_1b::test_22, |
@@ -115,7 +118,7 @@ | ||
115 | 118 | Timer t; |
116 | 119 | Sym sym; |
117 | 120 | |
118 | - printf("%d %d %p\n", width, height, pDest); | |
121 | + printf("%d %d %d %p\n", width, height, lineSize, pDest); | |
119 | 122 | |
120 | 123 | for (size_t i=0; i<countof(ptrs); ++i) { |
121 | 124 | t.Start(); |
@@ -1866,7 +1866,7 @@ | ||
1866 | 1866 | } |
1867 | 1867 | |
1868 | 1868 | // vMiddle |
1869 | - for (size_t y=vRad; y<vCount-vRad; ++y) { | |
1869 | + for (size_t y=vRad; y<vCount-vLen; ++y) { | |
1870 | 1870 | |
1871 | 1871 | const uint8_t* hMinus = hLine; |
1872 | 1872 | const uint8_t* hPlus = hLine+hLen; |
@@ -2069,7 +2069,7 @@ | ||
2069 | 2069 | __m128i* mvSumLine = (__m128i*)vSumLine; |
2070 | 2070 | |
2071 | 2071 | // vMiddle |
2072 | - for (size_t y=vRad; y<vCount-vRad; ++y) { | |
2072 | + for (size_t y=vRad; y<vCount-vLen; ++y) { | |
2073 | 2073 | |
2074 | 2074 | assert((ptrdiff_t)hLine % 16 == 0); |
2075 | 2075 |
@@ -2121,8 +2121,150 @@ | ||
2121 | 2121 | |
2122 | 2122 | } |
2123 | 2123 | |
2124 | +void test_13(const Parameter& p) { /* Experimental radius-1 (3x3) variant: walks the image in 64-byte-wide column strips, top to bottom within each strip. */ | |
2125 | + /* BLUR_EXTRACT_PARAMS is defined elsewhere; presumably it unpacks p into the locals used below (pSrc, pDest, pWork2, width, destLineOffsetBytes) - TODO confirm. */ | |
2126 | + BLUR_EXTRACT_PARAMS; | |
2124 | 2127 | |
2128 | + uint32_t hRad = p.radius; | |
2129 | + uint32_t vRad = p.radius; | |
2130 | + uint32_t hLen = 1 + hRad*2; | |
2131 | + uint32_t vLen = 1 + vRad*2; | |
2132 | + uint32_t invLen = 0xFFFFFF / (hLen*vLen); /* not referenced again in this function */ | |
2133 | + uint32_t hCount = p.width; | |
2134 | + uint32_t vCount = p.height; | |
2125 | 2135 | |
2136 | + static const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / 9); /* 16-bit reciprocal of 9 in every lane: mulhi_epu16(x, 0xFFFF/9) is roughly x/9, so the 3x3 kernel size is hard-coded */ | |
2137 | + | |
2138 | + if (vRad != 1) { /* only radius 1 is handled; any other radius returns before anything is written */ | |
2139 | + return; | |
2140 | + } | |
2141 | + | |
2142 | + const uint8_t* hLine = p.pSrc; /* not referenced again in this function */ | |
2143 | + uint8_t* vLine = p.pDest; /* offset on the next line but not referenced afterwards; the stores below start at pDest */ | |
2144 | + OffsetPtr(vLine, destLineOffsetBytes * vRad); | |
2145 | + | |
2146 | + uint16_t* vSumLine = (uint16_t*)pWork2; /* only used by the alignment assert below */ | |
2147 | + assert((ptrdiff_t)vSumLine % 16 == 0); | |
2148 | + assert((width * 2) % 16 == 0); | |
2149 | + /* remains: one __m128i per row kept in p.pWork; presumably the carry handed from one column strip to the next - TODO confirm against repeatShiftSum3. NOTE(review): it is read below without being initialised in this function. */ | |
2150 | + __m128i* remains = (__m128i*)p.pWork; | |
2151 | +#if 1 /* column-strip implementation is the active branch; the #else branch is a plain streaming copy */ | |
2152 | + const __m128i* mpSrc = (const __m128i*)pSrc; | |
2153 | + __m128i* mpDst = (__m128i*)pDest; | |
2154 | + for (size_t i=0; i<hCount/64; ++i) { /* one pass per strip of 4 x 16 = 64 bytes; a trailing partial strip (hCount % 64) is not processed */ | |
2155 | + const __m128i* src = mpSrc; | |
2156 | + __m128i* dst = mpDst; | |
2157 | + __m128i sums0 = _mm_setzero_si128(); /* sums0..7: running 16-bit totals that are scaled and stored on every row */ | |
2158 | + __m128i sums1 = _mm_setzero_si128(); | |
2159 | + __m128i sums2 = _mm_setzero_si128(); | |
2160 | + __m128i sums3 = _mm_setzero_si128(); | |
2161 | + __m128i sums4 = _mm_setzero_si128(); | |
2162 | + __m128i sums5 = _mm_setzero_si128(); | |
2163 | + __m128i sums6 = _mm_setzero_si128(); | |
2164 | + __m128i sums7 = _mm_setzero_si128(); | |
2165 | + __m128i adds0 = _mm_setzero_si128(); /* adds0..7: passed to repeatShiftSum3 for the current row; presumably receive the per-row horizontal sums - TODO confirm */ | |
2166 | + __m128i adds1 = _mm_setzero_si128(); | |
2167 | + __m128i adds2 = _mm_setzero_si128(); | |
2168 | + __m128i adds3 = _mm_setzero_si128(); | |
2169 | + __m128i adds4 = _mm_setzero_si128(); | |
2170 | + __m128i adds5 = _mm_setzero_si128(); | |
2171 | + __m128i adds6 = _mm_setzero_si128(); | |
2172 | + __m128i adds7 = _mm_setzero_si128(); | |
2173 | + __m128i mids0 = _mm_setzero_si128(); /* mids0..7: snapshot of adds taken each row before adds is handed to repeatShiftSum3 again */ | |
2174 | + __m128i mids1 = _mm_setzero_si128(); | |
2175 | + __m128i mids2 = _mm_setzero_si128(); | |
2176 | + __m128i mids3 = _mm_setzero_si128(); | |
2177 | + __m128i mids4 = _mm_setzero_si128(); | |
2178 | + __m128i mids5 = _mm_setzero_si128(); | |
2179 | + __m128i mids6 = _mm_setzero_si128(); | |
2180 | + __m128i mids7 = _mm_setzero_si128(); | |
2181 | + __m128i remain0 = _mm_setzero_si128(); /* remain0..2: passed to repeatShiftSum3 for groups 0..2, then added into the first adds register of the following group */ | |
2182 | + __m128i remain1 = _mm_setzero_si128(); | |
2183 | + __m128i remain2 = _mm_setzero_si128(); | |
2184 | + __m128i nsrc0 = src[0]; /* preload the first row of this strip (4 vectors = 64 bytes) */ | |
2185 | + __m128i nsrc1 = src[1]; | |
2186 | + __m128i nsrc2 = src[2]; | |
2187 | + __m128i nsrc3 = src[3]; | |
2188 | + for (size_t y=0; y<vCount; ++y) { | |
2189 | + sums0 = _mm_sub_epi16(sums0, mids0); /* drop the snapshot saved in mids during the previous iteration */ | |
2190 | + sums1 = _mm_sub_epi16(sums1, mids1); | |
2191 | + sums2 = _mm_sub_epi16(sums2, mids2); | |
2192 | + sums3 = _mm_sub_epi16(sums3, mids3); | |
2193 | + sums4 = _mm_sub_epi16(sums4, mids4); | |
2194 | + sums5 = _mm_sub_epi16(sums5, mids5); | |
2195 | + sums6 = _mm_sub_epi16(sums6, mids6); | |
2196 | + sums7 = _mm_sub_epi16(sums7, mids7); | |
2197 | + /* NOTE(review): mids lags adds by one iteration, so the subtraction above removes the adds values from two iterations back; if repeatShiftSum3 overwrites adds this leaves a 2-row window - confirm against the intended 3-row window. */ | |
2198 | + mids0 = adds0; | |
2199 | + mids1 = adds1; | |
2200 | + mids2 = adds2; | |
2201 | + mids3 = adds3; | |
2202 | + mids4 = adds4; | |
2203 | + mids5 = adds5; | |
2204 | + mids6 = adds6; | |
2205 | + mids7 = adds7; | |
2206 | + /* use the vectors loaded ahead of time as the current row, then load the next look-ahead set */ | |
2207 | + __m128i src0 = nsrc0; | |
2208 | + __m128i src1 = nsrc1; | |
2209 | + __m128i src2 = nsrc2; | |
2210 | + __m128i src3 = nsrc3; | |
2211 | + nsrc0 = src[4]; /* NOTE(review): src[4..7] are the 64 bytes immediately after src[0..3]; src has not been advanced to the next line yet - confirm this is the intended look-ahead and that it stays inside the buffer */ | |
2212 | + nsrc1 = src[5]; | |
2213 | + nsrc2 = src[6]; | |
2214 | + nsrc3 = src[7]; | |
2215 | + | |
2216 | + __m128i remain = remains[y]; /* value stored for this row by the previous strip's pass */ | |
2217 | + repeatShiftSum3(src0, adds0, adds1, remain0); /* repeatShiftSum3 is defined elsewhere; presumably the 3-tap horizontal sum of 16 source bytes into two 16-bit vectors plus a carry - TODO confirm */ | |
2218 | + adds0 = _mm_add_epi16(adds0, remain); | |
2219 | + repeatShiftSum3(src1, adds2, adds3, remain1); | |
2220 | + repeatShiftSum3(src2, adds4, adds5, remain2); | |
2221 | + repeatShiftSum3(src3, adds6, adds7, remain); | |
2222 | + remains[y] = remain; /* write back the value left in remain by the last group, for the next strip's pass over this row */ | |
2223 | + adds2 = _mm_add_epi16(adds2, remain0); | |
2224 | + adds4 = _mm_add_epi16(adds4, remain1); | |
2225 | + adds6 = _mm_add_epi16(adds6, remain2); | |
2226 | + /* fold the current row into the running totals */ | |
2227 | + sums0 = _mm_add_epi16(sums0, adds0); | |
2228 | + sums1 = _mm_add_epi16(sums1, adds1); | |
2229 | + sums2 = _mm_add_epi16(sums2, adds2); | |
2230 | + sums3 = _mm_add_epi16(sums3, adds3); | |
2231 | + sums4 = _mm_add_epi16(sums4, adds4); | |
2232 | + sums5 = _mm_add_epi16(sums5, adds5); | |
2233 | + sums6 = _mm_add_epi16(sums6, adds6); | |
2234 | + sums7 = _mm_add_epi16(sums7, adds7); | |
2235 | + /* scale each total by mInvRatio (high 16 bits of the product), pack pairs to unsigned 8-bit with saturation, and write with non-temporal stores (dst must be 16-byte aligned) */ | |
2236 | + __m128i result0 = _mm_packus_epi16(_mm_mulhi_epu16(sums0, mInvRatio), _mm_mulhi_epu16(sums1, mInvRatio)); | |
2237 | + _mm_stream_si128(dst+0, result0); | |
2238 | + __m128i result1 = _mm_packus_epi16(_mm_mulhi_epu16(sums2, mInvRatio), _mm_mulhi_epu16(sums3, mInvRatio)); | |
2239 | + _mm_stream_si128(dst+1, result1); | |
2240 | + __m128i result2 = _mm_packus_epi16(_mm_mulhi_epu16(sums4, mInvRatio), _mm_mulhi_epu16(sums5, mInvRatio)); | |
2241 | + _mm_stream_si128(dst+2, result2); | |
2242 | + __m128i result3 = _mm_packus_epi16(_mm_mulhi_epu16(sums6, mInvRatio), _mm_mulhi_epu16(sums7, mInvRatio)); | |
2243 | + _mm_stream_si128(dst+3, result3); | |
2244 | + OffsetPtr(dst, destLineOffsetBytes); | |
2245 | + OffsetPtr(src, destLineOffsetBytes); /* NOTE(review): src is stepped by the destination stride rather than a source stride - correct only while both strides are equal */ | |
2246 | + } | |
2247 | + mpSrc += 4; /* move both base pointers 4 vectors (64 bytes) right, to the next strip */ | |
2248 | + mpDst += 4; | |
2249 | + } | |
2250 | +#else /* disabled alternative: copies src to dst 64 bytes at a time with streaming stores, row by row, with no filtering */ | |
2251 | + const __m128i* mpSrc = (const __m128i*)&pSrc[0]; | |
2252 | + __m128i* mpDst = (__m128i*)&pDest[0]; | |
2253 | + for (size_t y=0; y<vCount; ++y) { | |
2254 | + for (size_t i=0; i<hCount/64; ++i) { | |
2255 | + size_t x = i*4; | |
2256 | + _mm_stream_si128(mpDst+x+0, mpSrc[x+0]); | |
2257 | + _mm_stream_si128(mpDst+x+1, mpSrc[x+1]); | |
2258 | + _mm_stream_si128(mpDst+x+2, mpSrc[x+2]); | |
2259 | + _mm_stream_si128(mpDst+x+3, mpSrc[x+3]); | |
2260 | + } | |
2261 | + OffsetPtr(mpDst, destLineOffsetBytes); | |
2262 | + OffsetPtr(mpSrc, destLineOffsetBytes); | |
2263 | + } | |
2264 | +#endif | |
2265 | + | |
2266 | +} | |
2267 | + | |
2126 | 2268 | void test_20(const Parameter& p) { |
2127 | 2269 | |
2128 | 2270 | BLUR_EXTRACT_PARAMS; |
@@ -44,6 +44,7 @@ | ||
44 | 44 | void test_10(const Parameter& p); // test_9 SSE optimization |
45 | 45 | void test_11(const Parameter& p); // fused horizontal & vertical computation |
46 | 46 | void test_12(const Parameter& p); // test_11 SSE optimization |
47 | +void test_13(const Parameter& p); // radius-1 only: SSE pass over 64-byte-wide column strips (experimental) | |
47 | 48 | |
48 | 49 | // TentFilter |
49 | 50 | void test_20(const Parameter& p); // C implementation |