// log entry ID: .i.m..a.r 2026-05-24 18:56:45 EEST // copyright Eric Toombs. license: GPLv3 // Shearing: a new method in PDEs // ============================== // This is a test of a technique in PDEs, which I call shearing. // A spatial shift is added to the time evolution to force information to travel only one way. // The spatial shift can then easily be undone afterward. // As long as the boundaries shift too, the result is identical. // Here are some diagrams: // (Time is horizontal, space is vertical.) // actual stencil: // . // . . // . // sheared stencil: // . // . // . . // This forces data to flow only downward. // Here's a complete example of shearing. // It is shown partway through simulation. // NX and NT are the number of points in space and time, respectively. // As drawn, NX = 17 and NT = 7. // "o" are the boundaries // "i" are the initial conditions // "y" is the final state. // "s", "d", and "c" form the intermediate state. // The whole intermediate state needs to be saved. // "c" is the current calculation with "d" being its dependencies. // o // i o // i . o // i . . o // i . . . o // i . . . . o // i . . . . . o // i . . . . . . o // i . . . . . . y // i . . . . . . y // i . . . . . . y // i . . . d s s y // i s s s d s s y // i s s s d c . y // i . . . . . . y // i . . . . . . y // o . . . . . . y // o . . . . . y // o . . . . y // o . . . y // o . . y // o . y // o y // o // To run // ====== // gcc -O3 -o shear shear.c && ./shear // typical output: // shearing time = 2747911879 ns // no shearing time = 5050869239 ns // // speedup: 1.838075 // Checking results... // Check succeeded! :D // The code // ======== // Dependencies // ------------ #include typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64; typedef int8_t i8; typedef int16_t i16; typedef int32_t i32; typedef int64_t i64; #include #include #include #include // Subtract two timespecs. i64 ts_sub(struct timespec *y, struct timespec *x) { return 1000*1000*1000*(y->tv_sec - x->tv_sec) + y->tv_nsec - x->tv_nsec; } void timets(struct timespec *ts) { i32 res = clock_gettime(CLOCK_MONOTONIC, ts); assert(res == 0); } // Return the difference in nanoseconds between old and the current time, and set old to the current time. i64 deltats(struct timespec *old) { struct timespec new; timets(&new); i64 delta = ts_sub(&new, old); *old = new; return delta; } // Program // ------- // NT needs to be small, to make sure the algorithm's intermediate state fits in the cache. // This isn't a real limitation, though. // Just run the entire algorithm again, and you compute the next NT steps. #define NX (1<<27) #define NT 32 // Integer math, rather than floating point math, is used to ensure the operations are exact. // This makes it possible to compare the sheared and unsheared calculations exactly. u64 rule(u64 a, u64 b, u64 c) { return a - 2*b + c; } u64 bc = 0; void ic(u64 *v) { for (u64 a = 0; a < NX; a++) v[a] = a*a/2 - a; v[0] = bc; v[NX-1] = bc; } int main(int argc, const char **argv) { struct timespec rt; timets(&rt); // Shearing // -------- u64 *w = malloc(NX*sizeof(u64)); assert(w != NULL); ic(w); // Intermediate state. // sxm1 is at x-2, sxm1 is at x-1, and sx is at x. u64 *sxm2 = malloc((NT + 1)*sizeof(u64)); assert(sxm2 != NULL); u64 *sxm1 = malloc((NT + 1)*sizeof(u64)); assert(sxm1 != NULL); u64 *sx = malloc((NT + 1)*sizeof(u64)); assert(sx != NULL); // Beginning stage // --------------- // c is the current value of x and t inside the loop. // 0 1 2 3 4 5 6 7 // 0 o // 1 i o // 2 i . o // 3 i . . o // 4 i . . . o // 5 i . . d s o sxm2 // 6 i s s d s s o sxm1 // 7 i s s d c . . o sx sxm1[0] = bc; for (u32 x = 1; x <= NT; x++) { sx[0] = w[x]; for (u32 t = 1; t < x; t++) sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]); sx[x] = bc; u64 *tmp = sxm2; sxm2 = sxm1; sxm1 = sx; sx = tmp; } // Middle stage // ------------ // Rows x=8 and x=9 require data from the beginning stage, so they have been included here. // 6 i . . . . . o (done) // 7 i . . . . . . o (done) // 8 i . . . . . . y // 9 i . . . . . . y // 10 i . . . . . . y // 11 i . . . d s s y sxm2 // 12 i s s s d s s y sxm1 // 13 i s s s d c . y sx // 14 i . . . . . . y // 15 i . . . . . . y // 16 o . . . . . . y for (u64 x = NT+1; x < NX; x++) { sx[0] = w[x]; for (u32 t = 1; t <= NT; t++) sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]); w[x - NT] = sx[NT]; u64 *tmp = sxm2; sxm2 = sxm1; sxm1 = sx; sx = tmp; } // Final stage // ----------- // 15 i . . . . . . y (done) // 16 o . . . . . . y (done) // 17 o . . d s s y sxm2 // 18 o s d s s y sxm1 // 19 o d c . y sx // 20 o . . y // 21 o . y // 22 o y for (u64 x = NX; x <= NX + NT - 2; x++) { sx[x - NX + 1] = bc; for (u32 t = x - NX + 2; t <= NT; t++) sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]); w[x - NT] = sx[NT]; u64 *tmp = sxm2; sxm2 = sxm1; sxm1 = sx; sx = tmp; } i64 t_shear = deltats(&rt); printf("shearing time = %ld ns\n", t_shear); // No shearing, for comparison // --------------------------- u64 *v0 = malloc(NX*sizeof(u64)); assert(v0 != NULL); u64 *v1 = malloc(NX*sizeof(u64)); assert(v1 != NULL); ic(v0); for (u32 t = 0; t < NT; t++) { for (u64 x = 1; x < NX-1; x++) v1[x] = rule(v0[x-1], v0[x], v0[x+1]); v1[0] = bc; v1[NX-1] = bc; u64 *tmp = v0; v0 = v1; v1 = tmp; } i64 t_no_shear = deltats(&rt); printf("no shearing time = %ld ns\n\n" "speedup: %.2fx\n" "Checking results...\n", t_no_shear, (double)(t_no_shear)/(double)(t_shear)); for (u64 x = 0; x < NX; x++) { if (w[x] != v0[x]) { printf("Check failed! D:\n"); return 1; } } printf("Check succeeded! :D\n"); return 0; } // Conclusion, further directions // ============================== // In this code, it is demonstrated that shearing exploits the RAM cache better, speeding up calculation. // The method is useful elsewhere, too, though. // It is easy to compute W rows at the same time with one W-wide SIMD operation. // This is how a GPU would do it, if they only had one compute unit. // (They don't! Don't actually do this!) // Here, W=12. // . // . . // . . . // . . . . // . . . . . // . . . . . . // . . . d . . . // . . . d . . . . // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // . . . d c // This is how you would *actually* do it on a GPU, since GPUs are composed of many communicating compute units, with finite latency between them. // Using shearing, the communication constraints between compute units can be greatly relaxed. // Rather than forcing synchronisation on every time step, the compute units working on the top part of the space must run ahead of those working on the bottom. // Then, none of the compute units needs to be exactly synchronised anymore. // Here's a diagram of this: // . // . . // . . . // . . . . // -------- // . . . . . // . . . . . . // . . . . . . . // . . . . . . . . // ---------------- // . . . . . . . . . // . . . . . . . . . . // . . . . . . . . . . . // . . . . . . . . . . . . // ------------------------ // . . . . . . . . . . . . d // . . . . . . . . . . . . d c // . . . . . . . . . . . r d c // . . . . . . . . . . . r d c // ---------------------------- // . . . . . . . . . . . d c // . . . . . . . . . . d c // . . . . . . . r . d c // . . . . . . r . d c // ------------------- // . . . . . d c // . . . . d c // . . . d c // . . d c // ---- indicates boundaries between compute units. // "c" and "d" mean the same thing as before. // "r" indicates a remote depenency. // Every four rows is one job. // The first three jobs have completed, and the compute units that were working on them have been given new jobs. // The greatest potential is in parallel computation between computers on a high-latency, unreliable, yet high throughput network. // It allows for collaboration over the internet that would otherwise be impossible. // This method easily extends to N spatial dimensions and to P-stencils. // "r" is read, "w" is write, and "a" is for read and write. // 2D, unsheared: // r // r a r // r // 2D, sheared: // r // r r r // r w // Now, information only flows downward and to the right. // 1D 5-stencil, unsheared: // r // r // r w // r // r // 1D 5-stencil, sheared: // r // r // r // r // r w // Information is only flowing down, just like the 3-stencil. // The intermediate state is larger and the compute jobs can't run as long before running into the space boundary, but it still works fine.