External Publication

RNN in C - is this BPTT finally right?

Hugging Face Forums [Unofficial] June 11, 2026

ok.. i’m back to dumping my code online after too many weeks. reading back through the thread, my best guess was that perhaps i’m training one character behind, except that after a few days of training it spat out “orgetful” yesterday, when the lnn would have been around 10.7.. so i’m obviously doing something egregiously wrong, as lnn seems to go down for 50-100k tests and then rises, not paralleling any dynamic in the material.

//	Network / windowing constants used by the training loop below.
#define hidd 256	//	hidden layer size
//	NOTE: the loop below derives the in-window step with `tests & winm` and detects the
//	end of a window with `tplus & wind`, so wind has to stay a power of two.
#define wind 64		//	truncated BPTT depth
//	wind + 1; presumably the number of hidden-state slots, since the loop indexes h[0] .. h[wind]
//	-- TODO confirm against the declaration of h (not visible here).
#define winp (wind + 1)
//	wind - 1; used as the bit mask for the step counter and as the last step index of a window.
#define winm (wind - 1)
//	Character-level RNN training loop with truncated BPTT.
//
//	Every character read from ifile is one time step:
//	  * input  = b  (the previous character's class, one-hot)
//	  * target = tc (the class of the character just read)
//	The forward pass stores the hidden state in h[twin + 1] and the softmax output in
//	netout[twin]. netout[twin] is then turned in place into the output gradient
//	(p - onehot(target)). Once `wind` steps have been collected, the gradients are
//	back-propagated through the whole window, clipped element-wise and applied with
//	plain SGD.
//
//	Fix versus the previous version: the loss used to be read inside the BPTT loop from
//	netout[iter][cin[iter]]. At that point netout already holds the gradient, not the
//	probabilities, and cin[iter] is the *input* class, not the target. The reported loss
//	was therefore not the cross-entropy of the prediction. It is now accumulated in the
//	forward pass, from the target probability, before netout is overwritten.
while (ifile.get(c)) {
    //	Step index inside the current window (0 .. wind-1). The mask relies on wind
    //	being a power of two.
    const int twin = tests & winm;
    const int tplus = twin + 1;

    //	At the start of every window except the very first one, carry the final hidden
    //	state of the previous window over as the initial state of this one.
    if (twin == 0 && tests != 0) memcpy(h[0], h[wind], sizeof h[0]);

    const int tc = chartonet(c);        //	target class: the character just read
    memset(netin, 0, sizeof netin);
    netin[b] = 1.f;                     //	one-hot input: the previous character
    cin[twin] = b;                      //	remembered for the input-weight gradient

    //	---- forward pass: hidden layer ----
    for (int i = 0; i < hidd; i++) {
        float sum = hbias[i] + nni[b][i];   //	one-hot input selects a single row of nni
        for (int j = 0; j < hidd; j++) sum += nnh[i][j] * h[twin][j];
        h[tplus][i] = tanhf(sum);
    }

    //	---- forward pass: output logits and numerically stable softmax ----
    float m = -1e9f;
    for (int i = 0; i < 96; i++) {
        float sum = obias[i];
        for (int j = 0; j < hidd; j++) sum += nno[j][i] * h[tplus][j];
        netout[twin][i] = sum;
        m = fmaxf(m, sum);
    }
    float ssum = 0.f;
    for (int i = 0; i < 96; i++) {
        netout[twin][i] = expf(netout[twin][i] - m);   //	subtract the max to avoid overflow
        ssum += netout[twin][i];
    }
    softsum[twin] = 0;
    if (ssum > 0.f) {
        softsum[twin] = ssum;
        const float inv = 1.f / ssum;
        for (int i = 0; i < 96; i++) netout[twin][i] *= inv;
    }

    //	Cross-entropy of this step, taken from the probability of the *target* class.
    //	This has to happen before the next line destroys the probability.
    ebuf -= logf(fmaxf(1e-18f, netout[twin][tc]));

    //	From here on netout[twin] holds dLoss/dlogits = p - onehot(target).
    netout[twin][tc] -= 1.f;

    if (tplus == wind) {    //	window complete: back propagation through time
        memset(dnni, 0, sizeof dnni);   //	gradient accumulators
        memset(dnnh, 0, sizeof dnnh);
        memset(dnno, 0, sizeof dnno);
        memset(dhbias, 0, sizeof dhbias);
        memset(dobias, 0, sizeof dobias);
        memset(dh_next, 0, sizeof dh_next);

        for (int iter = winm; iter >= 0; iter--) {
            const int iplus = iter + 1;     //	h[iplus] is the hidden state produced at step iter

            //	output layer gradients
            for (int i = 0; i < 96; i++) {
                const float dy = netout[iter][i];
                dobias[i] += dy;
                for (int j = 0; j < hidd; j++) dnno[j][i] += dy * h[iplus][j];
            }

            //	gradient reaching the hidden state: from the output layer plus from step iter+1
            memset(dh, 0, sizeof dh);
            for (int i = 0; i < hidd; i++) {
                float sum = dh_next[i];
                for (int j = 0; j < 96; j++) sum += nno[i][j] * netout[iter][j];
                dh[i] = sum;
            }

            //	back through tanh, then accumulate bias / input / recurrent gradients
            for (int i = 0; i < hidd; i++) {
                dh_raw[i] = (1.f - (h[iplus][i] * h[iplus][i])) * dh[i];
                dhbias[i] += dh_raw[i];
                dnni[cin[iter]][i] += dh_raw[i];    //	one-hot input: only one row receives gradient
                for (int j = 0; j < hidd; j++) dnnh[i][j] += dh_raw[i] * h[iter][j];
            }

            //	gradient handed to the previous time step (transpose of the recurrent weights)
            for (int i = 0; i < hidd; i++) {
                float sum = 0.f;
                for (int j = 0; j < hidd; j++) sum += nnh[j][i] * dh_raw[j];
                dh_next[i] = sum;
            }
        }

        //	Running average of the loss; same normalisation as before, computed once per window.
        edif = ebuf / static_cast<float>(max(1, tests));

        //	Element-wise gradient clipping followed by the SGD step. The clipped values are
        //	written back to the gradient arrays, as before.
        const float max_grad = 5.f;
        const auto clip = [max_grad](float g) { return fmaxf(-max_grad, fminf(max_grad, g)); };

        for (int i = 0; i < 96; i++) {
            for (int j = 0; j < hidd; j++) {
                dnni[i][j] = clip(dnni[i][j]);
                dnno[j][i] = clip(dnno[j][i]);
                nni[i][j] -= dnni[i][j] * learn;
                nno[j][i] -= dnno[j][i] * learn;
            }
            dobias[i] = clip(dobias[i]);
            obias[i] -= dobias[i] * learn;
        }
        for (int i = 0; i < hidd; i++) {
            dhbias[i] = clip(dhbias[i]);
            hbias[i] -= dhbias[i] * learn;
            for (int j = 0; j < hidd; j++) {
                dnnh[i][j] = clip(dnnh[i][j]);
                nnh[i][j] -= dnnh[i][j] * learn;
            }
        }
        //	(the rest of the BPTT branch and of the read loop lies outside this excerpt)

Discussion in the ATmosphere