@@ -2211,6 +2211,8 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
22112211 }
22122212 int vec_elements = get_vector_num_elements (vecs[0 ]->getType ());
22132213
2214+ int factor = gcd (vec_elements, (int )vecs.size ());
2215+
22142216 if (vecs.size () == 1 ) {
22152217 return vecs[0 ];
22162218 } else if (vecs.size () == 2 ) {
@@ -2221,57 +2223,97 @@ Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
22212223 indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
22222224 }
22232225 return optimization_fence (shuffle_vectors (a, b, indices));
2224- } else {
2225- // Grab the even and odd elements of vecs.
2226- vector<Value *> even_vecs;
2227- vector<Value *> odd_vecs;
2228- for (size_t i = 0 ; i < vecs.size (); i++) {
2229- if (i % 2 == 0 ) {
2230- even_vecs.push_back (vecs[i]);
2231- } else {
2232- odd_vecs.push_back (vecs[i]);
2226+ } else if (factor == 1 ) {
2227+ // The number of vectors and the vector length are
2228+ // coprime. (E.g. interleaving an odd number of vectors of some
2229+ // power-of-two length). Use the algorithm from "A Decomposition for
2230+ // In-place Matrix Transposition" by Catanzaro et al.
2231+ std::vector<Value *> v = vecs;
2232+
2233+ // Using unary shuffles, get each element into the right ultimate
2234+ // lane. This works out without collisions because the number of vectors
2235+ // and the length of each vector are coprime.
2236+ const int num_vecs = (int )v.size ();
2237+ std::vector<int > shuffle (vec_elements);
2238+ for (int i = 0 ; i < num_vecs; i++) {
2239+ for (int j = 0 ; j < vec_elements; j++) {
2240+ int k = j * num_vecs + i;
2241+ shuffle[k % vec_elements] = j;
22332242 }
2243+ v[i] = shuffle_vectors (v[i], v[i], shuffle);
22342244 }
22352245
2236- // If the number of vecs is odd, save the last one for later.
2237- Value *last = nullptr ;
2238- if (even_vecs.size () > odd_vecs.size ()) {
2239- last = even_vecs.back ();
2240- even_vecs.pop_back ();
2246+ // We intentionally don't put an optimization fence after the unary
2247+ // shuffles, because some architectures have a two-way shuffle, so it
2248+ // helps to fuse the unary shuffle into the first layer of two-way
2249+ // blends below.
2250+
2251+ // Now we need to transfer the elements across the vectors. If we
2252+ // reorder the vectors, this becomes a rotation across the vectors of a
2253+ // different amount per lane.
2254+ std::vector<Value *> new_v (v.size ());
2255+ for (int i = 0 ; i < num_vecs; i++) {
2256+ int j = (i * vec_elements) % num_vecs;
2257+ new_v[i] = v[j];
22412258 }
2242- internal_assert (even_vecs. size () == odd_vecs. size () );
2259+ v. swap (new_v );
22432260
2244- // Interleave the even and odd parts.
2245- Value *even = interleave_vectors (even_vecs);
2246- Value *odd = interleave_vectors (odd_vecs);
2261+ std::vector<int > rotation (vec_elements, 0 );
2262+ for (int i = 0 ; i < vec_elements; i++) {
2263+ int k = (i * num_vecs) % vec_elements;
2264+ rotation[k] = (i * num_vecs) / vec_elements;
2265+ }
2266+ internal_assert (rotation[0 ] == 0 );
22472267
2248- if (last) {
2249- int result_elements = vec_elements * vecs.size ();
2268+ // We'll handle each bit of the rotation one at a time with a two-way
2269+ // shuffle.
2270+ int d = 1 ;
2271+ while (d < num_vecs) {
22502272
2251- // Interleave even and odd, leaving a space for the last element.
2252- vector<int > indices (result_elements, -1 );
2253- for (int i = 0 , idx = 0 ; i < result_elements; i++) {
2254- if (i % vecs.size () < vecs.size () - 1 ) {
2255- indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size ();
2256- idx++;
2257- }
2273+ for (int i = 0 ; i < vec_elements; i++) {
2274+ shuffle[i] = ((rotation[i] & d) == 0 ) ? i : (i + vec_elements);
22582275 }
2259- Value *even_odd = shuffle_vectors (even, odd, indices);
22602276
2261- // Interleave the last vector into the result.
2262- last = slice_vector (last, 0 , result_elements);
2263- for (int i = 0 ; i < result_elements; i++) {
2264- if (i % vecs.size () < vecs.size () - 1 ) {
2265- indices[i] = i;
2266- } else {
2267- indices[i] = i / vecs.size () + result_elements;
2268- }
2277+ for (int i = 0 ; i < num_vecs; i++) {
2278+ int j = (i + num_vecs - d) % num_vecs;
2279+ new_v[i] = shuffle_vectors (v[i], v[j], shuffle);
22692280 }
22702281
2271- return shuffle_vectors (even_odd, last, indices );
2272- } else {
2273- return interleave_vectors ({even, odd}) ;
2282+ v. swap (new_v );
2283+
2284+ d *= 2 ;
22742285 }
2286+
2287+ return concat_vectors (v);
2288+
2289+ } else {
2290+ // The number of vectors shares a factor with the length of the
2291+ // vectors. Pick the smallest non-trivial factor f of the number of vectors,
2292+ // interleave in f separate groups, and then interleave the results.
2293+ const int n = (int )vecs.size ();
2294+ int f = 1 ;
2295+ for (int i = 2 ; i < n; i++) {
2296+ if (n % i == 0 ) {
2297+ f = i;
2298+ break ;
2299+ }
2300+ }
2301+
2302+ internal_assert (f > 1 && f < n);
2303+
2304+ vector<vector<Value *>> groups (f);
2305+ for (size_t i = 0 ; i < vecs.size (); i++) {
2306+ groups[i % f].push_back (vecs[i]);
2307+ }
2308+
2309+ // Interleave each group
2310+ vector<Value *> interleaved (f);
2311+ for (int i = 0 ; i < f; i++) {
2312+ interleaved[i] = optimization_fence (interleave_vectors (groups[i]));
2313+ }
2314+
2315+ // Interleave the result
2316+ return interleave_vectors (interleaved);
22752317 }
22762318}
22772319
0 commit comments