ROOT  6.06/09
Reference Guide
avx_sorthelper.cpp
Go to the documentation of this file.
1 /* This file is part of the Vc library.
2 
3  Copyright (C) 2011 Matthias Kretz <kretz@kde.org>
4 
5  Vc is free software: you can redistribute it and/or modify
6  it under the terms of the GNU Lesser General Public License as
7  published by the Free Software Foundation, either version 3 of
8  the License, or (at your option) any later version.
9 
10  Vc is distributed in the hope that it will be useful, but
11  WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17 
18 */
19 
20 #include <Vc/avx/intrinsics.h>
21 #include <Vc/avx/casts.h>
22 #include <Vc/avx/sorthelper.h>
23 #include <Vc/avx/macros.h>
24 
25 namespace ROOT {
26 namespace Vc
27 {
28 namespace AVX
29 {
30 
32 {
33  m128i lo, hi, y, x = _x;
34  // sort pairs
35  y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
36  lo = _mm_min_epi16(x, y);
37  hi = _mm_max_epi16(x, y);
38  x = _mm_blend_epi16(lo, hi, 0xaa);
39 
40  // merge left and right quads
41  y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
42  lo = _mm_min_epi16(x, y);
43  hi = _mm_max_epi16(x, y);
44  x = _mm_blend_epi16(lo, hi, 0xcc);
45  y = _mm_srli_si128(x, 2);
46  lo = _mm_min_epi16(x, y);
47  hi = _mm_max_epi16(x, y);
48  x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);
49 
50  // merge quads into octs
51  y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
52  y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
53  lo = _mm_min_epi16(x, y);
54  hi = _mm_max_epi16(x, y);
55 
56  x = _mm_unpacklo_epi16(lo, hi);
57  y = _mm_srli_si128(x, 8);
58  lo = _mm_min_epi16(x, y);
59  hi = _mm_max_epi16(x, y);
60 
61  x = _mm_unpacklo_epi16(lo, hi);
62  y = _mm_srli_si128(x, 8);
63  lo = _mm_min_epi16(x, y);
64  hi = _mm_max_epi16(x, y);
65 
66  return _mm_unpacklo_epi16(lo, hi);
67 }
69 {
70  m128i lo, hi, y, x = _x;
71  // sort pairs
72  y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
73  lo = _mm_min_epu16(x, y);
74  hi = _mm_max_epu16(x, y);
75  x = _mm_blend_epi16(lo, hi, 0xaa);
76 
77  // merge left and right quads
78  y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
79  lo = _mm_min_epu16(x, y);
80  hi = _mm_max_epu16(x, y);
81  x = _mm_blend_epi16(lo, hi, 0xcc);
82  y = _mm_srli_si128(x, 2);
83  lo = _mm_min_epu16(x, y);
84  hi = _mm_max_epu16(x, y);
85  x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);
86 
87  // merge quads into octs
88  y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
89  y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
90  lo = _mm_min_epu16(x, y);
91  hi = _mm_max_epu16(x, y);
92 
93  x = _mm_unpacklo_epi16(lo, hi);
94  y = _mm_srli_si128(x, 8);
95  lo = _mm_min_epu16(x, y);
96  hi = _mm_max_epu16(x, y);
97 
98  x = _mm_unpacklo_epi16(lo, hi);
99  y = _mm_srli_si128(x, 8);
100  lo = _mm_min_epu16(x, y);
101  hi = _mm_max_epu16(x, y);
102 
103  return _mm_unpacklo_epi16(lo, hi);
104 }
105 
106 template<> m256i SortHelper<int>::sort(VTArg _hgfedcba)
107 {
108  VectorType hgfedcba = _hgfedcba;
109  const m128i hgfe = hi128(hgfedcba);
110  const m128i dcba = lo128(hgfedcba);
111  m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
112  m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea
113 
114  m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
115  m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc
116 
117  l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
118  h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)
119 
120  x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
121  y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)
122 
123  m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
124  m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0
125 
126  // _mm_extract_epi32 from clang < 3.4 returns an unsigned int - the static_cast is free for
127  // conforming compilers, but fixes broken ones
128  if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 2)) >= static_cast<int>(_mm_extract_epi32(y, 1)))) {
129  return concat(Reg::permute<X0, X1, X2, X3>(b), a);
130  } else if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 0)) >= static_cast<int>(_mm_extract_epi32(y, 3)))) {
131  return concat(a, Reg::permute<X0, X1, X2, X3>(b));
132  }
133 
134  // merge
135  l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
136  h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
137 
138  a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
139  b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
140  l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
141  h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)
142 
143  a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
144  b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
145  l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
146  h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)
147 
148  return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
149 }
150 
152 {
153  VectorType hgfedcba = _hgfedcba;
154  const m128i hgfe = hi128(hgfedcba);
155  const m128i dcba = lo128(hgfedcba);
156  m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
157  m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea
158 
159  m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
160  m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc
161 
162  l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
163  h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)
164 
165  x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
166  y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)
167 
168  m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
169  m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0
170 
171  if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) {
172  return concat(Reg::permute<X0, X1, X2, X3>(b), a);
173  } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) {
174  return concat(a, Reg::permute<X0, X1, X2, X3>(b));
175  }
176 
177  // merge
178  l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
179  h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
180 
181  a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
182  b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
183  l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
184  h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)
185 
186  a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
187  b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
188  l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
189  h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)
190 
191  return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
192 }
193 
194 template<> m256 SortHelper<float>::sort(VTArg _hgfedcba)
195 {
196  VectorType hgfedcba = _hgfedcba;
197  const m128 hgfe = hi128(hgfedcba);
198  const m128 dcba = lo128(hgfedcba);
199  m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
200  m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea
201 
202  m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea
203  m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc
204 
205  l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
206  h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)
207 
208  x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
209  y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)
210 
211  m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0
212  m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
213 
214  // merge
215  l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
216  h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
217 
218  a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
219  b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
220  l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
221  h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)
222 
223  a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
224  b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
225  l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
226  h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)
227 
228  return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h));
229 }
230 
231 template<> m256 SortHelper<sfloat>::sort(VTArg hgfedcba)
232 {
233  return SortHelper<float>::sort(hgfedcba);
234 }
235 
237 {
238  m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0
239  m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0
240  x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0
241  y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1
242  l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0
243  h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1)
244  x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0
245  y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0
246  l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0
247  h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0
248  m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2
249  m256d b = Reg::permute<X2, X3, X1, X0>(l); // l2 l3 l1 l0
250 
251  // a3 >= a2 >= b1 >= b0
252  // b3 <= b2 <= a1 <= a0
253 
254  // merge
255  l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
256  h = _mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0
257 
258  x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0
259  y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1
260  l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0
261  h = _mm256_min_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1)
262 
263  x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0
264  y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3
265  l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3
266  h = _mm256_min_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3)
267 
268  x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0
269  y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1
270 }
272 {
273  VectorType dcba = _dcba;
274  /*
275  * to find the second largest number find
276  * max(min(max(ab),max(cd)), min(max(ad),max(bc)))
277  * or
278  * max(max(min(ab),min(cd)), min(max(ab),max(cd)))
279  *
280  const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8)));
281  const m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab)
282  const m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab)
283  // max(h3, h1)
284  // max(min(h0,h2), min(h3,h1))
285  // min(max(l0,l2), max(l3,l1))
286  // min(l3, l1)
287 
288  const m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2)
289  //const m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2);
290  const m256d hh = _mm256_max_pd(
291  Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l),
292  Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l));
293  _mm256_min_pd(hh0, hh1
294  */
295 
296  //////////////////////////////////////////////////////////////////////////////////
297  // max(max(ac), max(bd))
298  // max(max(min(ac),min(bd)), min(max(ac),max(bd)))
299  // min(max(min(ac),min(bd)), min(max(ac),max(bd)))
300  // min(min(ac), min(bd))
301  m128d l = _mm_min_pd(lo128(dcba), hi128(dcba)); // min(bd) min(ac)
302  m128d h = _mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac)
303  m128d h0_l0 = _mm_unpacklo_pd(l, h);
304  m128d h1_l1 = _mm_unpackhi_pd(l, h);
305  l = _mm_min_pd(h0_l0, h1_l1);
306  h = _mm_max_pd(h0_l0, h1_l1);
307  return concat(
308  _mm_min_pd(l, Reg::permute<X0, X0>(h)),
309  _mm_max_pd(h, Reg::permute<X1, X1>(l))
310  );
311  // extract: 1 cycle
312  // min/max: 4 cycles
313  // unpacklo/hi: 2 cycles
314  // min/max: 4 cycles
315  // permute: 1 cycle
316  // min/max: 4 cycles
317  // insert: 1 cycle
318  // ----------------------
319  // total: 17 cycles
320 
321  /*
322  m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba);
323  m256d l = _mm256_min_pd(dcba, cdab);
324  m256d h = _mm256_max_pd(dcba, cdab);
325  m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h);
326  m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h);
327 
328  l = _mm256_min_pd(maxmin_ba, maxmin_dc);
329  h = _mm256_max_pd(maxmin_ba, maxmin_dc);
330 
331  return _mm256_blend_pd(h, l, 0x55);
332  */
333 
334  /*
335  // a b c d
336  // b a d c
337  // sort pairs
338  m256d y, l, h;
339  m128d l2, h2;
340  y = shuffle<X1, Y0, X3, Y2>(x, x);
341  l = _mm256_min_pd(x, y); // min[ab ab cd cd]
342  h = _mm256_max_pd(x, y); // max[ab ab cd cd]
343 
344  // 1 of 2 is at [0]
345  // 1 of 4 is at [1]
346  // 1 of 4 is at [2]
347  // 1 of 2 is at [3]
348 
349  // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps
350  x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd
351  l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd)
352  h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd
353 
354  // either it is:
355  return concat(l2, h2);
356  // or:
357  // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2));
358 
359  // I'd like to have four useful compares
360  const m128d dc = hi128(dcba);
361  const m128d ba = lo128(dcba);
362  const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8)));
363 
364  const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba));
365  // 0x0: d <= b && c <= a
366  // 0x1: d <= b && c > a
367  // 0x2: d > b && c <= a
368  // 0x3: d > b && c > a
369 
370  switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) {
371  // impossible: 0x0, 0xf
372  case 0x1: // a <= b && b <= c && c <= d && d > a
373  // abcd
374  return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba));
375  case 0x2: // a <= b && b <= c && c > d && d <= a
376  // dabc
377  return Reg::permute<X2, X3, X0, X1>(adcb);
378  case 0x3: // a <= b && b <= c && c > d && d > a
379  // a[bd]c
380  if (extraCmp & 2) {
381  // abdc
382  return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba));
383  } else {
384  // adbc
385  return Reg::permute<X3, X2, X0, X1>(adcb);
386  }
387  case 0x4: // a <= b && b > c && c <= d && d <= a
388  // cdab;
389  return Reg::permute<X2, X3, X0, X1>(dcba);
390  case 0x5: // a <= b && b > c && c <= d && d > a
391  // [ac] < [bd]
392  switch (extraCmp) {
393  case 0x0: // d <= b && c <= a
394  // cadb
395  return shuffle<>(dcba, bcda);
396  case 0x1: // d <= b && c > a
397  case 0x2: // d > b && c <= a
398  case 0x3: // d > b && c > a
399  }
400  case 0x6: // a <= b && b > c && c > d && d <= a
401  // d[ac]b
402  case 0x7: // a <= b && b > c && c > d && d > a
403  // adcb;
404  return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda));
405  case 0x8: // a > b && b <= c && c <= d && d <= a
406  return bcda;
407  case 0x9: // a > b && b <= c && c <= d && d > a
408  // b[ac]d;
409  case 0xa: // a > b && b <= c && c > d && d <= a
410  // [ac] > [bd]
411  case 0xb: // a > b && b <= c && c > d && d > a
412  // badc;
413  return permute128<X1, X0>(dcba);
414  case 0xc: // a > b && b > c && c <= d && d <= a
415  // c[bd]a;
416  case 0xd: // a > b && b > c && c <= d && d > a
417  // cbad;
418  return permute<X1, X0, X3, X2>(bcda);
419  case 0xe: // a > b && b > c && c > d && d <= a
420  return dcba;
421  }
422  */
423 }
424 
425 } // namespace AVX
426 } // namespace Vc
427 } // namespace ROOT
static VectorType sort(VTArg)
static Vc_INTRINSIC unsigned int Vc_CONST _mm_extract_epu32(param128i x, const int i)
Definition: intrinsics.h:219
Namespace for new ROOT classes and functions.
Definition: ROOT.py:1
TH1 * h
Definition: legend2.C:5
__m128i m128i
Definition: intrinsics.h:112
__m256d m256d
Definition: intrinsics.h:114
TArc * a
Definition: textangle.C:12
VectorTypeHelper< T >::Type VectorType
Definition: sorthelper.h:32
Vc_INTRINSIC Vc_CONST m128 hi128(param256 v)
Definition: casts.h:118
Double_t x[n]
Definition: legend1.C:17
const VectorType VTArg
Definition: sorthelper.h:36
Vc_INTRINSIC Vc_CONST m256 concat(param128 a, param128 b)
Definition: casts.h:123
#define AVX
Definition: global.h:90
#define VC_IS_UNLIKELY(x)
Definition: macros.h:143
__m256i m256i
Definition: intrinsics.h:115
TLine * l
Definition: textangle.C:4
Double_t y[n]
Definition: legend1.C:17
#define VC_RESTRICT
Definition: macros.h:145
Definition: casts.h:28
float type_of_call hi(const int &, const int &)
__m128d m128d
Definition: intrinsics.h:111
Vc_INTRINSIC Vc_CONST m128 lo128(param256 v)
Definition: casts.h:115