// log entry ID: r...m..a. 2026-05-28 15:26:55 EEST

// copyright Eric Toombs. License: GPLv3.

// Shearing, take II
// =================

// Since my last log entry, I found a better way to run the shearing algorithm.
// The basic idea is the same, but I start from the bottom instead of the top.
// This allows the whole algorithm to run in-place.
// From this, I went from around 1.8x in the last log entry to 2.7x speedup.

// This new version of the algorithm should also be the easiest way for compute units in a GPU to communicate.

// To run
// ======
// gcc -O3 -o shear-v2 shear-v2.c && ./shear-v2

// typical output:
// shearing time    = 1853015756 ns
// no shearing time = 5085624703 ns
// 
// speedup: 2.74x
// Checking results...
// Check succeeded! :D


// The code
// ========


// Dependencies
// ------------
#include <inttypes.h>
typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32;
typedef uint64_t u64; typedef unsigned __int128 u128;
typedef  int8_t i8; typedef  int16_t i16; typedef  int32_t i32;
typedef  int64_t i64; typedef __int128 i128;
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <stdio.h>
// Subtract two timespecs.
i64 ts_sub(struct timespec *y, struct timespec *x)
{
    return 1000*1000*1000*(y->tv_sec - x->tv_sec) + y->tv_nsec - x->tv_nsec;
}
void timets(struct timespec *ts)
{
    i32 res = clock_gettime(CLOCK_MONOTONIC, ts);
    assert(res == 0);
}
// Return the difference in nanoseconds between old and the current time, and set old to the current time.
i64 deltats(struct timespec *old)
{
    struct timespec new;
    timets(&new);
    i64 delta = ts_sub(&new, old);
    *old = new;
    return delta;
}


// Program
// -------
#define MEMORY (1<<30)
#define BSIZE 16
#define NB (MEMORY/sizeof(u64)/BSIZE)
#define NT 32
#define NX (BSIZE*NB + NT + 1)

u64 rule(u64 a, u64 b, u64 c)
{
    return a - 2*b + c;
}

u64 bc = 0;
void ic(u64 *v)
{
    for (u64 a = 0; a < NX; a++)
        v[a] = a*a/2 - a;
    v[0] = bc;
    v[NX-1] = bc;
}

int main(int argc, const char **argv)
{
    struct timespec rt;
    timets(&rt);


    // Shearing
    // --------
    u64 *w = malloc((NX + NT)*sizeof(u64));
    assert(w != NULL);
    ic(w);

    // Beginning stage
    // ---------------
    // NT = 7, as drawn.
    //      0 1 2 3 4 5 6 7
    // NX-8 i
    // NX-7 i
    // NX-6 i s
    // NX-5 i s
    // NX-4 i . s
    // NX-3 i . s
    // NX-2 i . . s
    // NX-1 o . . s
    // NX+0   o . . s
    // NX+1     o . s
    // NX+2       o . s
    // NX+3         o s
    // NX+4           o s
    // NX+5             o
    // NX+6               o
    for (u32 t = 1; t <= NT; t++) {
        w[NX + t - 1] = bc;
        for (u64 x = NX + t - 2; x >= NX - NT + 2*t - 1; x--) {
            w[x] = rule(w[x-2], w[x-1], w[x]);
        }
    }


    // Middle stage
    // ------------
    // NX = 26 = 18 + 7 + 1 = 2*BSIZE + NT + 1, NT = 7, BSIZE=9, NB=2 as drawn.
    //      0 1 2 3 4 5 6 7
    //    0 o               
    //    1 i               
    //    2 i s             
    //    3 i s             
    //    4 i . s           
    //    5 i . s           
    //    6 i . . s         
    //    7 i . . s         
    //    8 i . . . s       
    //    9 i . . . s       
    //   10 i . . . . s     
    //   11 i s . . . s     
    //   12 i s . . . . s   
    //   13 i . s . . . s   
    //   14 i . s . . . . y
    //   15 i . . s . . . y
    //   16 i . . s . . . y
    //   17 i . . . s . . y
    // NX-8 i . . . s . . y
    // NX-7 i . . . . s . y
    // NX-6 i s . . . s . y
    // NX-5 i s . . . . s y
    // NX-4 i , s . . . s y
    // NX-3 i , s . . . . y
    // NX-2 i , , s . . . y
    // NX-1 o , , s . . . y
    // NX+0   o , , s . . y
    // NX+1     o , s . . y
    // NX+2       o , s . y
    // NX+3         o s . y
    // NX+4           o s y
    // NX+5             o y
    // NX+6               o
    for (u64 b = 0; b < NB; b++) {
        for (u32 t = 1; t <= NT; t++) {
            u64 x0 = NX - NT - BSIZE*b + 2*(t - 1);
            for (u64 x = x0; x > x0 - BSIZE; x--) {
                w[x] = rule(w[x-2], w[x-1], w[x]);
            }
        }
    }

    // Final stage
    // -----------
    //      0 1 2 3 4 5 6 7
    //    0 o               
    //    1 i o             
    //    2 i s o           
    //    3 i s . o         
    //    4 i , s . o       
    //    5 i , s . . o     
    //    6 i , , s . . o   
    //    7 i , , s . . . y 
    //    8 i , , , s . . y 
    //    9 i , , , s . . y 
    //   10 i , , , , s . y 
    //   11 i , , , , s . y 
    //   12 i , , , , , s y 
    //   13 i , , , , , s y 
    //   14 i , , , , , , ,
    //   15 i , , , , , , ,
    //   16 i , , , , , , ,
    //   17 i , , , , , , ,
    // NX-8 i , , , , , , ,
    // NX-7 i , , , , , , ,
    // NX-6 i , , , , , , ,
    // NX-5 i , , , , , , ,
    // NX-4 i , , , , , , ,
    // NX-3 i , , , , , , ,
    // NX-2 i , , , , , , ,
    // NX-1 o , , , , , , ,
    // NX+0   o , , , , , ,
    // NX+1     o , , , , ,
    // NX+2       o , , , ,
    // NX+3         o , , ,
    // NX+4           o , ,
    // NX+5             o ,
    // NX+6               o
    for (u32 t = 1; t <= NT; t++) {
        for (u64 x = 2*t - 1; x >= t + 1; x--) {
            w[x] = rule(w[x-2], w[x-1], w[x]);
        }
        w[t] = bc;
    }

    i64 t_shear = deltats(&rt);

    printf("shearing time    = %ld ns\n", t_shear);


    // No shearing, for comparison
    // ---------------------------
    u64 *v0 = malloc(NX*sizeof(u64));
    assert(v0 != NULL);
    u64 *v1 = malloc(NX*sizeof(u64));
    assert(v1 != NULL);
    ic(v0);
    for (u32 t = 0; t < NT; t++) {
        for (u64 x = 1; x < NX-1; x++)
            v1[x] = rule(v0[x-1], v0[x], v0[x+1]);
        v1[0] = bc;
        v1[NX-1] = bc;
        u64 *tmp = v0;
        v0 = v1;
        v1 = tmp;
    }
    i64 t_no_shear = deltats(&rt);

    printf("no shearing time = %ld ns\n\n"
        "speedup: %.2fx\n"
        "Checking results...\n",
        t_no_shear,
        (double)(t_no_shear)/(double)(t_shear)
    );
    for (u64 x = 0; x < NX; x++) {
        if (w[x + NT] != v0[x]) {
            printf("Check failed! D:\n");
            return 1;
        }
    }
    printf("Check succeeded! :D\n");
    return 0;
}