• R/O
  • SSH
  • HTTPS

imagefilter: Commit


Commit MetaInfo

Révision 26 (tree)
l'heure 2013-07-12 09:52:24
Auteur berupon

Message de Log

added another implementation with different memory access pattern (incomplete)
respected cache line size

Change Summary

Modification

--- trunk/main.cpp (revision 25)
+++ trunk/main.cpp (revision 26)
@@ -33,8 +33,9 @@
3333 size_t width = imageInfo.width;
3434 size_t height = imageInfo.height;
3535 assert(imageInfo.bitsPerSample == 8 && imageInfo.samplesPerPixel == 1);
36- const size_t size = width * height;
3736
37+ size_t lineSize = (width + 63) & (~63);
38+ const size_t size = lineSize * height;
3839 // std::vector<unsigned char> in(size);
3940 // std::vector<unsigned char> dest(size);
4041 // std::vector<unsigned char> work(size);
@@ -44,7 +45,7 @@
4445 unsigned char* pWork2 = (unsigned char*) _mm_malloc(size*2, 64);
4546
4647 unsigned char palettes[256 * 4];
47- ReadImageData(fo, pSrc, width, palettes);
48+ ReadImageData(fo, pSrc, lineSize, palettes);
4849 fclose(f);
4950
5051 for (size_t i=0; i<size; ++i) {
@@ -60,6 +61,7 @@
6061 // const size_t nThreads = 4;
6162 #else
6263 const size_t nThreads = si.dwNumberOfProcessors;
64+// const size_t nThreads = 1;
6365 #endif
6466 Threads<blur_1b::Parameter> threads;
6567 threads.SetUp(nThreads);
@@ -71,7 +73,7 @@
7173 pCommon.height = partHeight;
7274 pCommon.srcLineOffsetBytes =
7375 pCommon.workLineOffsetBytes =
74- pCommon.destLineOffsetBytes = width;
76+ pCommon.destLineOffsetBytes = lineSize;
7577 pCommon.radius = 1;
7678 pCommon.iterationCount = 1;
7779 std::vector<blur_1b::Parameter> params(nThreads);
@@ -87,15 +89,15 @@
8789 p.pWork = pWork + i * partSize * 2;
8890 p.pWork2 = pWork2 + i * partSize * 2;
8991 p.pDest = pDest + i * partSize;
90- p.pTotal = _mm_malloc(width * sizeof(int32_t), 64);
91- p.pModi = _mm_malloc(width * sizeof(int32_t), 64);
92+ p.pTotal = _mm_malloc(lineSize * sizeof(int32_t), 64);
93+ p.pModi = _mm_malloc(lineSize * sizeof(int32_t), 64);
9294 }
9395 typedef void (*BlurFuncPtr)(const blur_1b::Parameter& p);
9496 BlurFuncPtr ptrs[] = {
9597 //blur_1b::test_1,
9698 //blur_1b::test_2,
97- blur_1b::test_3,
98- blur_1b::test_4,
99+ //blur_1b::test_3,
100+ //blur_1b::test_4,
99101 blur_1b::test_5_h,
100102 blur_1b::test_5_v,
101103 blur_1b::test_5_h,
@@ -107,6 +109,7 @@
107109 blur_1b::test_10,
108110 blur_1b::test_11,
109111 blur_1b::test_12,
112+ blur_1b::test_13,
110113
111114 //blur_1b::test_21,
112115 //blur_1b::test_22,
@@ -115,7 +118,7 @@
115118 Timer t;
116119 Sym sym;
117120
118- printf("%d %d %p\n", width, height, pDest);
121+ printf("%d %d %d %p\n", width, height, lineSize, pDest);
119122
120123 for (size_t i=0; i<countof(ptrs); ++i) {
121124 t.Start();
--- trunk/blur_1b.cpp (revision 25)
+++ trunk/blur_1b.cpp (revision 26)
@@ -1866,7 +1866,7 @@
18661866 }
18671867
18681868 // vMiddle
1869- for (size_t y=vRad; y<vCount-vRad; ++y) {
1869+ for (size_t y=vRad; y<vCount-vLen; ++y) {
18701870
18711871 const uint8_t* hMinus = hLine;
18721872 const uint8_t* hPlus = hLine+hLen;
@@ -2069,7 +2069,7 @@
20692069 __m128i* mvSumLine = (__m128i*)vSumLine;
20702070
20712071 // vMiddle
2072- for (size_t y=vRad; y<vCount-vRad; ++y) {
2072+ for (size_t y=vRad; y<vCount-vLen; ++y) {
20732073
20742074 assert((ptrdiff_t)hLine % 16 == 0);
20752075
@@ -2121,8 +2121,150 @@
21212121
21222122 }
21232123
2124+void test_13(const Parameter& p) {
2125+
2126+ BLUR_EXTRACT_PARAMS;
21242127
2128+ uint32_t hRad = p.radius;
2129+ uint32_t vRad = p.radius;
2130+ uint32_t hLen = 1 + hRad*2;
2131+ uint32_t vLen = 1 + vRad*2;
2132+ uint32_t invLen = 0xFFFFFF / (hLen*vLen);
2133+ uint32_t hCount = p.width;
2134+ uint32_t vCount = p.height;
21252135
2136+ static const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / 9);
2137+
2138+ if (vRad != 1) {
2139+ return;
2140+ }
2141+
2142+ const uint8_t* hLine = p.pSrc;
2143+ uint8_t* vLine = p.pDest;
2144+ OffsetPtr(vLine, destLineOffsetBytes * vRad);
2145+
2146+ uint16_t* vSumLine = (uint16_t*)pWork2;
2147+ assert((ptrdiff_t)vSumLine % 16 == 0);
2148+ assert((width * 2) % 16 == 0);
2149+
2150+ __m128i* remains = (__m128i*)p.pWork;
2151+#if 1
2152+ const __m128i* mpSrc = (const __m128i*)pSrc;
2153+ __m128i* mpDst = (__m128i*)pDest;
2154+ for (size_t i=0; i<hCount/64; ++i) {
2155+ const __m128i* src = mpSrc;
2156+ __m128i* dst = mpDst;
2157+ __m128i sums0 = _mm_setzero_si128();
2158+ __m128i sums1 = _mm_setzero_si128();
2159+ __m128i sums2 = _mm_setzero_si128();
2160+ __m128i sums3 = _mm_setzero_si128();
2161+ __m128i sums4 = _mm_setzero_si128();
2162+ __m128i sums5 = _mm_setzero_si128();
2163+ __m128i sums6 = _mm_setzero_si128();
2164+ __m128i sums7 = _mm_setzero_si128();
2165+ __m128i adds0 = _mm_setzero_si128();
2166+ __m128i adds1 = _mm_setzero_si128();
2167+ __m128i adds2 = _mm_setzero_si128();
2168+ __m128i adds3 = _mm_setzero_si128();
2169+ __m128i adds4 = _mm_setzero_si128();
2170+ __m128i adds5 = _mm_setzero_si128();
2171+ __m128i adds6 = _mm_setzero_si128();
2172+ __m128i adds7 = _mm_setzero_si128();
2173+ __m128i mids0 = _mm_setzero_si128();
2174+ __m128i mids1 = _mm_setzero_si128();
2175+ __m128i mids2 = _mm_setzero_si128();
2176+ __m128i mids3 = _mm_setzero_si128();
2177+ __m128i mids4 = _mm_setzero_si128();
2178+ __m128i mids5 = _mm_setzero_si128();
2179+ __m128i mids6 = _mm_setzero_si128();
2180+ __m128i mids7 = _mm_setzero_si128();
2181+ __m128i remain0 = _mm_setzero_si128();
2182+ __m128i remain1 = _mm_setzero_si128();
2183+ __m128i remain2 = _mm_setzero_si128();
2184+ __m128i nsrc0 = src[0];
2185+ __m128i nsrc1 = src[1];
2186+ __m128i nsrc2 = src[2];
2187+ __m128i nsrc3 = src[3];
2188+ for (size_t y=0; y<vCount; ++y) {
2189+ sums0 = _mm_sub_epi16(sums0, mids0);
2190+ sums1 = _mm_sub_epi16(sums1, mids1);
2191+ sums2 = _mm_sub_epi16(sums2, mids2);
2192+ sums3 = _mm_sub_epi16(sums3, mids3);
2193+ sums4 = _mm_sub_epi16(sums4, mids4);
2194+ sums5 = _mm_sub_epi16(sums5, mids5);
2195+ sums6 = _mm_sub_epi16(sums6, mids6);
2196+ sums7 = _mm_sub_epi16(sums7, mids7);
2197+
2198+ mids0 = adds0;
2199+ mids1 = adds1;
2200+ mids2 = adds2;
2201+ mids3 = adds3;
2202+ mids4 = adds4;
2203+ mids5 = adds5;
2204+ mids6 = adds6;
2205+ mids7 = adds7;
2206+
2207+ __m128i src0 = nsrc0;
2208+ __m128i src1 = nsrc1;
2209+ __m128i src2 = nsrc2;
2210+ __m128i src3 = nsrc3;
2211+ nsrc0 = src[4];
2212+ nsrc1 = src[5];
2213+ nsrc2 = src[6];
2214+ nsrc3 = src[7];
2215+
2216+ __m128i remain = remains[y];
2217+ repeatShiftSum3(src0, adds0, adds1, remain0);
2218+ adds0 = _mm_add_epi16(adds0, remain);
2219+ repeatShiftSum3(src1, adds2, adds3, remain1);
2220+ repeatShiftSum3(src2, adds4, adds5, remain2);
2221+ repeatShiftSum3(src3, adds6, adds7, remain);
2222+ remains[y] = remain;
2223+ adds2 = _mm_add_epi16(adds2, remain0);
2224+ adds4 = _mm_add_epi16(adds4, remain1);
2225+ adds6 = _mm_add_epi16(adds6, remain2);
2226+
2227+ sums0 = _mm_add_epi16(sums0, adds0);
2228+ sums1 = _mm_add_epi16(sums1, adds1);
2229+ sums2 = _mm_add_epi16(sums2, adds2);
2230+ sums3 = _mm_add_epi16(sums3, adds3);
2231+ sums4 = _mm_add_epi16(sums4, adds4);
2232+ sums5 = _mm_add_epi16(sums5, adds5);
2233+ sums6 = _mm_add_epi16(sums6, adds6);
2234+ sums7 = _mm_add_epi16(sums7, adds7);
2235+
2236+ __m128i result0 = _mm_packus_epi16(_mm_mulhi_epu16(sums0, mInvRatio), _mm_mulhi_epu16(sums1, mInvRatio));
2237+ _mm_stream_si128(dst+0, result0);
2238+ __m128i result1 = _mm_packus_epi16(_mm_mulhi_epu16(sums2, mInvRatio), _mm_mulhi_epu16(sums3, mInvRatio));
2239+ _mm_stream_si128(dst+1, result1);
2240+ __m128i result2 = _mm_packus_epi16(_mm_mulhi_epu16(sums4, mInvRatio), _mm_mulhi_epu16(sums5, mInvRatio));
2241+ _mm_stream_si128(dst+2, result2);
2242+ __m128i result3 = _mm_packus_epi16(_mm_mulhi_epu16(sums6, mInvRatio), _mm_mulhi_epu16(sums7, mInvRatio));
2243+ _mm_stream_si128(dst+3, result3);
2244+ OffsetPtr(dst, destLineOffsetBytes);
2245+ OffsetPtr(src, destLineOffsetBytes);
2246+ }
2247+ mpSrc += 4;
2248+ mpDst += 4;
2249+ }
2250+#else
2251+ const __m128i* mpSrc = (const __m128i*)&pSrc[0];
2252+ __m128i* mpDst = (__m128i*)&pDest[0];
2253+ for (size_t y=0; y<vCount; ++y) {
2254+ for (size_t i=0; i<hCount/64; ++i) {
2255+ size_t x = i*4;
2256+ _mm_stream_si128(mpDst+x+0, mpSrc[x+0]);
2257+ _mm_stream_si128(mpDst+x+1, mpSrc[x+1]);
2258+ _mm_stream_si128(mpDst+x+2, mpSrc[x+2]);
2259+ _mm_stream_si128(mpDst+x+3, mpSrc[x+3]);
2260+ }
2261+ OffsetPtr(mpDst, destLineOffsetBytes);
2262+ OffsetPtr(mpSrc, destLineOffsetBytes);
2263+ }
2264+#endif
2265+
2266+}
2267+
21262268 void test_20(const Parameter& p) {
21272269
21282270 BLUR_EXTRACT_PARAMS;
--- trunk/blur_1b.h (revision 25)
+++ trunk/blur_1b.h (revision 26)
@@ -44,6 +44,7 @@
4444 void test_10(const Parameter& p); // test_9 SSE optimization
4545 void test_11(const Parameter& p); // fused horizontal & vertical computation
4646 void test_12(const Parameter& p); // test_11 SSE optimization
47+void test_13(const Parameter& p); // test_11 SSE optimization
4748
4849 // TentFilter
4950 void test_20(const Parameter& p); // C implementation
Afficher sur ancien navigateur de dépôt.