An Extreme Learning Machine with good generalization - AMSelectWide

AMSelectWide (JS) Code:

It selects between positive and negative weight vector blocks (each chosen with probability about 1/2) based on the sign of a sign-flip + WHT random projection of the input x.


The code defines a small learning module that uses pseudo-random sign flipping combined with a normalized Fast Walsh–Hadamard Transform (FWHT) to perform random projections and update a set of learned weights.


class AMSelectWide {

  constructor(vecLen, density, rate) {

    this.vecLen = vecLen;

    this.density = density;

    this.rate = rate / density;

    this.wts = new Float32Array(2 * density * vecLen);

    this.surface = new Float32Array((density + 1) * vecLen);

    this.workA = new Float32Array(vecLen);

    this.workB = new Float32Array(vecLen);

    this.workC = new Float32Array(vecLen);

    this.flipIn = new FlipIn();

    this.flipOut = new FlipOut();

  }

  // Compute the output vector for input x. When result === workC (the call made
  // by train), each intermediate projection is also saved into surface.
  recall(result, x) {

    this.flipIn.reset();

    this.flipOut.reset();

    // Two rounds of pseudo-random sign flipping + WHT turn x into the
    // selector projection workA.
    this.flipIn.flipNext(this.workA, x);

    whtN(this.workA);

    this.flipIn.flipNext(this.workA, this.workA);

    whtN(this.workA);

    zero(result);

    let wk = this.workA;

    // Called from train(): snapshot the selector projection for the update pass.
    if (result === this.workC) copy_up(this.surface, 0, this.workA);

    for (let i = 0; i < this.density; i++, wk = this.workB) {

      this.flipIn.flipNext(this.workB, wk);

      whtN(this.workB);

      if (result === this.workC)

        copy_up(this.surface, (i + 1) * this.vecLen, this.workB);

      // Select the positive or negative weight block for projection i,
      // based on the sign of the selector coordinate workA[i].
      let idx = 2 * i * this.vecLen;

      if (this.workA[i] > 0) idx += this.vecLen;

      for (let j = 0; j < this.vecLen; j++) {

        result[j] += this.wts[idx + j] * this.workB[j];

      }

      whtN(result);

      this.flipOut.flipNext(result, result);

    }

  }

  // Delta-rule style update: move the weights so that recall(x) gets closer to target.
  train(target, x) {

    this.recall(this.workC, x);

    for (let i = 0; i < this.vecLen; i++) {

      this.workC[i] = (target[i] - this.workC[i]) * this.rate;

    }

    for (let i = this.density - 1; i >= 0; i--) {

      this.flipOut.flipPrevious(this.workC, this.workC);

      whtN(this.workC);

      // Update the same weight block that recall() selected for projection i
      // (surface[i] holds the selector value workA[i] saved during recall).
      let idx = 2 * i * this.vecLen;

      if (this.surface[i] > 0) idx += this.vecLen;

      let s = (i + 1) * this.vecLen;

      for (let j = 0; j < this.vecLen; j++) {

        this.wts[idx + j] += this.surface[s + j] * this.workC[j];

      }

    }

  }

}


function copy_up(to, at, from) {

  for (let i = 0; i < from.length; i++) {

    to[at + i] = from[i];

  }

}

function zero(x) {

  for (let i = 0; i < x.length; i++) x[i] = 0;

}


// Walsh–Hadamard transform, scaled so the vector magnitude is left unchanged

function whtN(vec) {

  const n = vec.length;

  const sc = 1.0 / Math.sqrt(n);

  for (let i = 0; i < n; i += 2) {

    const a = vec[i];

    const b = vec[i + 1];

    vec[i] = (a + b) * sc;

    vec[i + 1] = (a - b) * sc;

  }

  let hs = 2;

  while (hs < n) {

    let i = 0;

    while (i < n) {

      const j = i + hs;

      while (i < j) {

        const a = vec[i];

        const b = vec[i + hs];

        vec[i] = a + b;

        vec[i + hs] = a - b;

        i += 1;

      }

      i += hs;

    }

    hs += hs;

  }

}


class FlipIn {

  constructor() {

    this.reset_val = 0x9e3779b9;

    this.state = this.reset_val;

  }

  reset() {

    this.state = this.reset_val;

  }

  flipNext(x, y) {

    for (let i = 0; i < x.length; i++) {

      this.state = Math.imul(this.state, 0x93d765dd) >>> 0;

      x[i] = (this.state | 0) < 0 ? -y[i] : y[i];

    }

  }

  flipPrevious(x, y) {

    for (let i = x.length - 1; i >= 0; i--) {

      x[i] = (this.state | 0) < 0 ? -y[i] : y[i];

      this.state = Math.imul(this.state, 0x7aae1a75) >>> 0;

    }

  }

}


class FlipOut {

  constructor() {

    this.reset_val = 0x3e9d9fe5;

    this.state = this.reset_val;

  }

  reset() {

    this.state = this.reset_val;

  }

  flipNext(x, y) {

    for (let i = 0; i < x.length; i++) {

      this.state = Math.imul(this.state, 0xe47135) >>> 0;

      x[i] = (this.state | 0) < 0 ? -y[i] : y[i];

    }

  }

  flipPrevious(x, y) {

    for (let i = x.length - 1; i >= 0; i--) {

      x[i] = (this.state | 0) < 0 ? -y[i] : y[i];

      this.state = Math.imul(this.state, 0xc5ed191d) >>> 0;

    }

  }

}

Here’s the breakdown:


High-level overview

  • Purpose:
    The AMSelectWide class acts much like a sparse, random-projection-based associative memory.
    It transforms input vectors by:

    1. Flipping their signs using a deterministic pseudo-random sequence.

    2. Applying a normalized Fast Walsh–Hadamard Transform (FWHT) to mix the elements.

    3. Repeating this a few times to produce intermediate “random projections”.

    4. Using these projections to look up and combine learned weights.

  • Two modes:

    • recall() → Given an input vector, produces an output vector from learned weights.

    • train() → Given a target output vector and an input vector, updates the weights so that future recalls will be closer to the target.
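
A minimal usage sketch (the sizes and rate below are made up for illustration; vecLen must be a power of two for whtN, and density should not exceed vecLen, since recall() reads workA[i] for i < density):

const net = new AMSelectWide(256, 16, 0.01); // vecLen, density, rate
const x = new Float32Array(256);             // input vector
const target = new Float32Array(256);        // desired output for x
// ... fill x and target with data ...
for (let step = 0; step < 100; step++) {
  net.train(target, x);                      // nudge the weights toward target
}
const out = new Float32Array(256);
net.recall(out, x);                          // out should now be close to target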


The supporting components

Walsh–Hadamard transform (whtN)

  • A fast orthogonal transform similar to the FFT, but only using additions and subtractions.

  • This implementation:

    • Scales by 1/√n so that the vector magnitude stays unchanged.

    • Does the standard iterative butterfly structure of the FWHT.

  • Effect: it mixes all elements together in a structured way.
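
As a quick check of those properties (this snippet assumes the whtN function above; the sample values are arbitrary):

const v = Float32Array.from([3, 1, 4, 1, 5, 9, 2, 6]); // length must be a power of 2
const before = Math.hypot(...v);
whtN(v);
const after = Math.hypot(...v); // ≈ before: the magnitude is preserved
whtN(v);                        // the normalized WHT is its own inverse, so v is
                                // now back to (approximately) 3, 1, 4, 1, 5, 9, 2, 6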


FlipIn and FlipOut

  • Both are pseudo-random sign generators based on simple multiplicative integer hashing.

  • They produce a sequence of +1 and –1 multipliers in a repeatable way.

  • flipNext(x, y) → Copies y into x but flips each element’s sign according to the PRNG sequence.

  • flipPrevious() → Same idea but runs backwards (for reversing in train()).

FlipIn and FlipOut use different seeds and multipliers, so their sign sequences differ.
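
A small check of the forward/backward pairing (it assumes the FlipIn class above; the same holds for FlipOut): the two multipliers in each class are modular inverses mod 2^32, so flipPrevious revisits exactly the signs that flipNext produced, and applying one after the other restores the input.

const f = new FlipIn();
const src = Float32Array.from([1, -2, 3, -4]);
const tmp = new Float32Array(4);
f.flipNext(tmp, src);     // apply the pseudo-random sign pattern
f.flipPrevious(tmp, tmp); // walk the same pattern backwards, flipping the signs back
// tmp now equals src again, and f.state is back at its seed value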


Utility functions

  • copy_up(to, at, from) → Copies from into to starting at offset at.

  • zero(x) → Fills x with zeros.


Inside AMSelectWide

Constructor

  • vecLen → Length of each vector.

  • density → Number of random projections to use.

  • rate → Learning rate scaled by 1/density.

  • wts → Main weight storage array: (2 * density * vecLen) floats.
    (2 because each projection can be in one of two states: positive or negative).

  • surface → Stores intermediate projected vectors for use in training.

  • workA, workB, workC → Temporary working arrays.

  • flipIn, flipOut → PRNG sign flippers for the forward and backward passes.
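
The layout of wts implied by the constructor and the recall() indexing, as a sketch (blockOffset is a hypothetical helper name introduced here, not part of the original code):

// Projection i owns two contiguous blocks of vecLen weights, one per sign of
// the selector coordinate workA[i]:
//   negative block: wts[2*i*vecLen ... 2*i*vecLen + vecLen - 1]
//   positive block: wts[2*i*vecLen + vecLen ... 2*(i+1)*vecLen - 1]
function blockOffset(i, vecLen, selectorIsPositive) {
  return 2 * i * vecLen + (selectorIsPositive ? vecLen : 0);
}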


Recall process

Given x (the input vector), recall(result, x):

  1. Reset both sign flippers.

  2. Step 1:

    • Random sign flip (flipIn) of x into workA.

    • Apply FWHT (whtN).

  3. Step 2:

    • Another random sign flip (still flipIn) → workA again.

    • Apply FWHT again.

    • This produces a first “projection”.

  4. If the call came from train() (i.e. result is workC), store this projection in surface.

  5. Loop over density projections:

    • Flip signs of wk into workB.

    • FWHT on workB.

    • Store in surface if training later.

    • Decide which half of the weights to use:
      If workA[i] > 0, pick the “positive” weight block, else the “negative”.

    • Add weighted projection into result.

    • FWHT on result (mix it again).

    • Flip signs with flipOut.

  6. End result: a vector derived from multiple pseudo-random transforms of x.


Training process

Given target and x, train(target, x):

  1. Run recall into workC (this also fills surface with intermediate states).

  2. Compute the error: (target - recall) * learningRate.

  3. Back-propagate through the pseudo-random transforms in reverse:

    • For each projection in reverse order:

      • Undo the last flipOut (via flipPrevious).

      • Undo the FWHT (the normalized transform is its own inverse, so whtN is simply applied again).

      • Pick correct weight block based on stored surface[i] sign.

      • Update weights proportionally to the projection and error.

  4. This modifies wts so that next recall will be closer to target.
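
A rough convergence check along those lines (the sizes and learning rate are made-up values, and this is a sanity test rather than a guarantee): training repeatedly on a single (x, target) pair should drive the recall output toward the target.

const n = 64, density = 8;
const net = new AMSelectWide(n, density, 0.5);
const x = new Float32Array(n).map(() => Math.random() - 0.5);
const target = new Float32Array(n).map(() => Math.random() - 0.5);
const out = new Float32Array(n);
for (let step = 0; step < 200; step++) net.train(target, x);
net.recall(out, x);
let err = 0;
for (let j = 0; j < n; j++) err += (target[j] - out[j]) ** 2;
console.log('squared recall error after training:', err); // should be close to 0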


Plain English analogy

Imagine you have a memory where each input is scrambled several times in a special but repeatable way (sign flips + Hadamard transform). Each scrambling produces a “projection” of the input, and each projection is used to fetch a set of weights. These weights are combined, scrambled again, and finally turned into your output.

When you train, you see how far off your output was from the target and “unscramble” everything in reverse order, adjusting the weights in the exact places that contributed to the output.

The randomness is not truly random — it’s generated in a deterministic way so that the same input always produces the same scramblings.


The intuition behind why this setup can generalize well:

  • Core transformation is linear
    The sign-flipping (FlipIn / FlipOut) and Walsh–Hadamard transform are both linear operations (over ℝ). If you froze the “positive/negative block choice,” the whole recall path would be purely linear in the input.

  • Mild non-linearity from block switching
    The only non-linearity comes from

    if (workA[i] > 0) idx += vecLen;

    which is essentially a binary decision based on the sign of a projection coordinate. This acts like the sign test used in locality-sensitive hashing (LSH), a roughly fair Bernoulli variable:

    • Probability ≈ 0.5 of positive vs. negative block choice if the projection is symmetric around 0.

    • Two similar inputs will tend to produce the same sign for many projections, so they share weight blocks often.

    • Dissimilar inputs diverge in their signs, hitting different blocks more often.

  • Why this helps generalization

    1. The projection space (after pseudo-random flipping + WHT) tends to spread information evenly while preserving distances in an average sense (like a Johnson–Lindenstrauss transform).

    2. The sign-based block selection adds a “just enough” non-linearity that groups similar inputs but doesn’t fragment the space too aggressively.

    3. Because the non-linearity is binary and sparse, the model behaves more like a set of piecewise-linear regions rather than a highly warped space.

    4. This is similar to certain randomized hashing kernels used in nearest-neighbor search — they preserve locality with high probability due to the Bernoulli nature of sign tests.

In effect, it’s like having a mildly non-linear, high-dimensional, locality-preserving hashing layer in front of a mostly linear model. That’s often a sweet spot for generalization because:

  • The linear parts are easy to train and stable.

  • The sign-switch introduces discrete “addressing” that lets the model capture more complex patterns.

  • The randomness avoids overfitting to any particular coordinate basis.
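
One way to see the locality claim directly (selectorSigns is a hypothetical helper written for this illustration; it simply re-runs the first part of recall()): two nearby inputs should agree on most of the block-selection signs, while unrelated inputs agree only about half the time.

function selectorSigns(net, x) {
  const w = new Float32Array(net.vecLen);
  net.flipIn.reset();
  net.flipIn.flipNext(w, x); // the same two sign-flip + WHT rounds
  whtN(w);                   // that recall() uses to build workA
  net.flipIn.flipNext(w, w);
  whtN(w);
  // The first `density` coordinates are the ones that decide the weight blocks.
  return Array.from(w.subarray(0, net.density), v => (v > 0 ? 1 : -1));
}

Counting how often selectorSigns(net, x) and selectorSigns(net, xPerturbed) agree, for some slightly perturbed copy xPerturbed of x, gives a direct view of how coarsely or finely the block addressing separates inputs.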

