// log entry ID: .i.m..a.r 2026-05-24 18:56:45 EEST

// copyright Eric Toombs. license: GPLv3

// Shearing: a new method in PDEs
// ==============================

// This is a test of a technique in PDEs, which I call shearing.
// A spatial shift is added to the time evolution to force information to travel only one way.
// The spatial shift can then easily be undone afterward.
// As long as the boundaries shift too, the result is identical.

// Here are some diagrams:
// (Time is horizontal, space is vertical.)
// actual stencil:
// .
// . .
// .

// sheared stencil:
// .
// .
// . .

// This forces data to flow only downward.

// Here's a complete example of shearing.
// It is shown partway through simulation.
// NX and NT are the number of points in space and time, respectively.
// As drawn, NX = 17 and NT = 7.
// "o" are the boundaries
// "i" are the initial conditions
// "y" is the final state.
// "s", "d", and "c" form the intermediate state.
// The whole intermediate state needs to be saved.
// "c" is the current calculation with "d" being its dependencies.
// o
// i o
// i . o
// i . . o
// i . . . o
// i . . . . o
// i . . . . . o
// i . . . . . . o
// i . . . . . . y
// i . . . . . . y
// i . . . . . . y
// i . . . d s s y
// i s s s d s s y
// i s s s d c . y
// i . . . . . . y
// i . . . . . . y
// o . . . . . . y
//   o . . . . . y
//     o . . . . y
//       o . . . y
//         o . . y
//           o . y
//             o y
//               o

// To run
// ======
// gcc -O3 -o shear shear.c && ./shear

// typical output:
// shearing time    = 2747911879 ns
// no shearing time = 5050869239 ns
// 
// speedup: 1.838075
// Checking results...
// Check succeeded! :D


// The code
// ========


// Dependencies
// ------------
#include <inttypes.h>
typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64;
typedef  int8_t i8; typedef  int16_t i16; typedef  int32_t i32; typedef  int64_t i64;
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <stdio.h>
// Subtract two timespecs.
i64 ts_sub(struct timespec *y, struct timespec *x)
{
    return 1000*1000*1000*(y->tv_sec - x->tv_sec) + y->tv_nsec - x->tv_nsec;
}
void timets(struct timespec *ts)
{
    i32 res = clock_gettime(CLOCK_MONOTONIC, ts);
    assert(res == 0);
}
// Return the difference in nanoseconds between old and the current time, and set old to the current time.
i64 deltats(struct timespec *old)
{
    struct timespec new;
    timets(&new);
    i64 delta = ts_sub(&new, old);
    *old = new;
    return delta;
}


// Program
// -------
// NT needs to be small, to make sure the algorithm's intermediate state fits in the cache.
// This isn't a real limitation, though.
// Just run the entire algorithm again, and you compute the next NT steps.
#define NX (1<<27)
#define NT 32

// Integer math, rather than floating point math, is used to ensure the operations are exact.
// This makes it possible to compare the sheared and unsheared calculations exactly.
u64 rule(u64 a, u64 b, u64 c)
{
    return a - 2*b + c;
}

u64 bc = 0;
void ic(u64 *v)
{
    for (u64 a = 0; a < NX; a++)
        v[a] = a*a/2 - a;
    v[0] = bc;
    v[NX-1] = bc;
}

int main(int argc, const char **argv)
{
    struct timespec rt;
    timets(&rt);


    // Shearing
    // --------
    u64 *w = malloc(NX*sizeof(u64));
    assert(w != NULL);
    ic(w);

    // Intermediate state.
    // sxm1 is at x-2, sxm1 is at x-1, and sx is at x.
    u64 *sxm2 = malloc((NT + 1)*sizeof(u64));
    assert(sxm2 != NULL);
    u64 *sxm1 = malloc((NT + 1)*sizeof(u64));
    assert(sxm1 != NULL);
    u64 *sx = malloc((NT + 1)*sizeof(u64));
    assert(sx != NULL);

    // Beginning stage
    // ---------------
    // c is the current value of x and t inside the loop.
    //   0 1 2 3 4 5 6 7
    // 0 o
    // 1 i o
    // 2 i . o
    // 3 i . . o
    // 4 i . . . o
    // 5 i . . d s o     sxm2
    // 6 i s s d s s o   sxm1
    // 7 i s s d c . . o sx
    sxm1[0] = bc;
    for (u32 x = 1; x <= NT; x++) {
        sx[0] = w[x];
        for (u32 t = 1; t < x; t++)
            sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]);
        sx[x] = bc;

        u64 *tmp = sxm2;
        sxm2 = sxm1;
        sxm1 = sx;
        sx = tmp;
    }


    // Middle stage
    // ------------
    // Rows x=8 and x=9 require data from the beginning stage, so they have been included here.
    //  6 i . . . . . o     (done)
    //  7 i . . . . . . o   (done)
    //  8 i . . . . . . y
    //  9 i . . . . . . y
    // 10 i . . . . . . y
    // 11 i . . . d s s y  sxm2
    // 12 i s s s d s s y  sxm1
    // 13 i s s s d c . y  sx
    // 14 i . . . . . . y
    // 15 i . . . . . . y
    // 16 o . . . . . . y
    for (u64 x = NT+1; x < NX; x++) {
        sx[0] = w[x];
        for (u32 t = 1; t <= NT; t++)
            sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]);
        w[x - NT] = sx[NT];

        u64 *tmp = sxm2;
        sxm2 = sxm1;
        sxm1 = sx;
        sx = tmp;
    }

    // Final stage
    // -----------
    // 15 i . . . . . . y (done)
    // 16 o . . . . . . y (done)
    // 17   o . . d s s y  sxm2
    // 18     o s d s s y  sxm1
    // 19       o d c . y  sx
    // 20         o . . y
    // 21           o . y
    // 22             o y
    for (u64 x = NX; x <= NX + NT - 2; x++) {
        sx[x - NX + 1] = bc;
        for (u32 t = x - NX + 2; t <= NT; t++)
            sx[t] = rule(sxm2[t-1], sxm1[t-1], sx[t-1]);
        w[x - NT] = sx[NT];

        u64 *tmp = sxm2;
        sxm2 = sxm1;
        sxm1 = sx;
        sx = tmp;
    }

    i64 t_shear = deltats(&rt);

    printf("shearing time    = %ld ns\n", t_shear);


    // No shearing, for comparison
    // ---------------------------
    u64 *v0 = malloc(NX*sizeof(u64));
    assert(v0 != NULL);
    u64 *v1 = malloc(NX*sizeof(u64));
    assert(v1 != NULL);
    ic(v0);
    for (u32 t = 0; t < NT; t++) {
        for (u64 x = 1; x < NX-1; x++)
            v1[x] = rule(v0[x-1], v0[x], v0[x+1]);
        v1[0] = bc;
        v1[NX-1] = bc;
        u64 *tmp = v0;
        v0 = v1;
        v1 = tmp;
    }
    i64 t_no_shear = deltats(&rt);

    printf("no shearing time = %ld ns\n\n"
        "speedup: %.2fx\n"
        "Checking results...\n", t_no_shear, (double)(t_no_shear)/(double)(t_shear));
    for (u64 x = 0; x < NX; x++) {
        if (w[x] != v0[x]) {
            printf("Check failed! D:\n");
            return 1;
        }
    }
    printf("Check succeeded! :D\n");
    return 0;
}

// Conclusion, further directions
// ==============================
// In this code, it is demonstrated that shearing exploits the RAM cache better, speeding up calculation.
// The method is useful elsewhere, too, though.

// It is easy to compute W rows at the same time with one W-wide SIMD operation.
// This is how a GPU would do it, if they only had one compute unit.
// (They don't! Don't actually do this!)
// Here, W=12.
// .
// . .
// . . .
// . . . .
// . . . . .
// . . . . . .
// . . . d . . .
// . . . d . . . .
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c
// . . . d c


// This is how you would *actually* do it on a GPU, since GPUs are composed of many communicating compute units, with finite latency between them.
// Using shearing, the communication constraints between compute units can be greatly relaxed.
// Rather than forcing synchronisation on every time step, the compute units working on the top part of the space must run ahead of those working on the bottom.
// Then, none of the compute units needs to be exactly synchronised anymore.
// Here's a diagram of this:
// .
// . .
// . . .
// . . . .
// --------
// . . . . .
// . . . . . .
// . . . . . . .
// . . . . . . . .
// ----------------
// . . . . . . . . .
// . . . . . . . . . .
// . . . . . . . . . . .
// . . . . . . . . . . . .
// ------------------------
// . . . . . . . . . . . . d
// . . . . . . . . . . . . d c
// . . . . . . . . . . . r d c
// . . . . . . . . . . . r d c
// ----------------------------
// . . . . . . . . . . . d c
//   . . . . . . . . . . d c
//     . . . . . . . r . d c
//       . . . . . . r . d c
//        -------------------
//         . . . . . d c
//           . . . . d c
//             . . . d c
//               . . d c
// ---- indicates boundaries between compute units.
// "c" and "d" mean the same thing as before.
// "r" indicates a remote depenency.
// Every four rows is one job.
// The first three jobs have completed, and the compute units that were working on them have been given new jobs.

// The greatest potential is in parallel computation between computers on a high-latency, unreliable, yet high throughput network.
// It allows for collaboration over the internet that would otherwise be impossible.


// This method easily extends to N spatial dimensions and to P-stencils.
// "r" is read, "w" is write, and "a" is for read and write.

// 2D, unsheared:
//   r
// r a r
//   r

// 2D, sheared:
//   r
// r r r
//   r w

// Now, information only flows downward and to the right.

// 1D 5-stencil, unsheared:
// r
// r
// r w
// r
// r

// 1D 5-stencil, sheared:
// r
// r
// r
// r
// r w

// Information is only flowing down, just like the 3-stencil.
// The intermediate state is larger and the compute jobs can't run as long before running into the space boundary, but it still works fine.