2021-02-01 14:31:20 +09:00
|
|
|
/*
|
|
|
|
|
* ******************************************************************************
|
|
|
|
|
* *
|
|
|
|
|
* *
|
|
|
|
|
* * This program and the accompanying materials are made available under the
|
|
|
|
|
* * terms of the Apache License, Version 2.0 which is available at
|
|
|
|
|
* * https://www.apache.org/licenses/LICENSE-2.0.
|
|
|
|
|
* *
|
2021-02-01 17:47:29 +09:00
|
|
|
* * See the NOTICE file distributed with this work for additional
|
|
|
|
|
* * information regarding copyright ownership.
|
2021-02-01 14:31:20 +09:00
|
|
|
* * Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
|
|
|
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
|
|
|
* * License for the specific language governing permissions and limitations
|
|
|
|
|
* * under the License.
|
|
|
|
|
* *
|
|
|
|
|
* * SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
* *****************************************************************************
|
|
|
|
|
*/
|
2019-06-06 15:21:15 +03:00
|
|
|
|
|
|
|
|
package org.deeplearning4j.nn.layers;
|
|
|
|
|
|
2023-03-23 17:39:00 +01:00
|
|
|
import java.lang.reflect.Constructor;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import lombok.Getter;
|
|
|
|
|
import lombok.NonNull;
|
|
|
|
|
import lombok.Setter;
|
2020-04-23 01:36:49 +03:00
|
|
|
import lombok.extern.slf4j.Slf4j;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.exception.DL4JInvalidInputException;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.nn.api.ITrainableLayer;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.nn.api.Layer;
|
|
|
|
|
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.nn.conf.layers.BaseLayerConfiguration;
|
|
|
|
|
import org.deeplearning4j.nn.conf.layers.LayerConfiguration;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.deeplearning4j.nn.gradient.DefaultGradient;
|
|
|
|
|
import org.deeplearning4j.nn.gradient.Gradient;
|
|
|
|
|
import org.deeplearning4j.nn.params.DefaultParamInitializer;
|
|
|
|
|
import org.deeplearning4j.nn.workspace.ArrayType;
|
|
|
|
|
import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr;
|
|
|
|
|
import org.deeplearning4j.optimize.Solver;
|
|
|
|
|
import org.deeplearning4j.optimize.api.ConvexOptimizer;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.deeplearning4j.optimize.api.TrainingListener;
|
|
|
|
|
import org.nd4j.common.primitives.Pair;
|
|
|
|
|
import org.nd4j.evaluation.IEvaluation;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.nd4j.linalg.api.buffer.DataType;
|
|
|
|
|
import org.nd4j.linalg.api.memory.MemoryWorkspace;
|
|
|
|
|
import org.nd4j.linalg.api.ndarray.INDArray;
|
|
|
|
|
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNorm;
|
|
|
|
|
import org.nd4j.linalg.api.ops.impl.transforms.custom.LayerNormBp;
|
2023-03-23 17:39:00 +01:00
|
|
|
import org.nd4j.linalg.dataset.api.DataSet;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.MultiDataSet;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
|
|
|
|
|
import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;
|
2019-06-06 15:21:15 +03:00
|
|
|
import org.nd4j.linalg.factory.Nd4j;
|
|
|
|
|
import org.nd4j.linalg.indexing.NDArrayIndex;
|
|
|
|
|
import org.nd4j.linalg.learning.regularization.Regularization;
|
|
|
|
|
|
2023-03-23 17:39:00 +01:00
|
|
|
/** A layer with parameters */
|
2020-04-23 01:36:49 +03:00
|
|
|
@Slf4j
|
2023-03-23 17:39:00 +01:00
|
|
|
public abstract class BaseLayer<LayerConfT extends BaseLayerConfiguration>
|
|
|
|
|
extends AbstractLayer<LayerConfT> implements ITrainableLayer {
|
|
|
|
|
|
|
|
|
|
  /** Score (loss value) computed by the most recent computeGradientAndScore() call. */
  protected double score = 0.0;
  /** Optimizer for standalone fitting of this layer; lazily created by getOptimizer(). */
  protected ConvexOptimizer optimizer;
  /** Gradient from the most recent backward pass, as returned by gradient(). */
  protected Gradient gradient;
  /** Solver used by fit(INDArray, LayerWorkspaceMgr); lazily created on first fit. */
  protected Solver solver;
  /** Post-noise parameters cached during training (see getParamWithNoise); cleared after backprop. */
  protected Map<String, INDArray> weightNoiseParams = new HashMap<>();
  /** Flattened view of all parameters (typically a view of a larger, network-level array). */
  protected INDArray paramsFlattened;
  /** Flattened view of all gradients, set via setBackpropGradientsViewArray(). */
  protected INDArray gradientsFlattened;

  /** Map of parameter key -> parameter array (values are typically views of paramsFlattened). */
  @Getter @Setter protected Map<String, INDArray> paramTable;

  /** Map of parameter key -> gradient view (views of gradientsFlattened); not serialized. */
  @Getter protected transient Map<String, INDArray> gradientViews;
|
|
|
|
|
|
|
|
|
|
  /**
   * Creates a layer from the given configuration.
   *
   * <p>Note: parameter access on this class is routed through the paramTable rather than a single
   * {@code INDArray params} field.
   *
   * @param conf the layer configuration
   * @param dataType the data type for parameters and activations
   */
  public BaseLayer(LayerConfiguration conf, DataType dataType) {
    super(conf, dataType);
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * This method executes evaluation of the model against given iterator and evaluation
   * implementations.
   *
   * <p>NOTE(review): not implemented at this level — always returns {@code null}, regardless of
   * input. Callers expecting populated evaluations must not rely on this base implementation.
   *
   * @param iterator data to evaluate on (ignored)
   * @param evaluations evaluation implementations (ignored)
   * @return always {@code null} in this base implementation
   */
  @Override
  public <T extends IEvaluation> T[] doEvaluation(DataSetIterator iterator, T... evaluations) {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * This method executes evaluation of the model against given iterator and evaluation
   * implementations.
   *
   * <p>NOTE(review): not implemented at this level — always returns {@code null}, regardless of
   * input.
   *
   * @param iterator data to evaluate on (ignored)
   * @param evaluations evaluation implementations (ignored)
   * @return always {@code null} in this base implementation
   */
  @Override
  public <T extends IEvaluation> T[] doEvaluation(MultiDataSetIterator iterator, T... evaluations) {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /** Init the model. No-op in this base implementation. */
  @Override
  public void init() {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Update layer weights and biases with gradient change.
   *
   * <p>No-op in this base implementation.
   *
   * @param gradient the gradient to apply (ignored)
   */
  @Override
  public void update(Gradient gradient) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Perform one update applying the gradient.
   *
   * <p>No-op in this base implementation.
   *
   * @param gradient the gradient to apply (ignored)
   * @param paramType the parameter key the gradient applies to (ignored)
   */
  @Override
  public void update(INDArray gradient, String paramType) {}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* the number of parameters for the model
|
|
|
|
|
*
|
|
|
|
|
* @return the number of parameters for the model
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public long numParams() {
|
|
|
|
|
int ret = 0;
|
|
|
|
|
for (INDArray val : paramTable.values()) ret += val.length();
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * The current inputs batch size.
   *
   * @return always 0 in this base implementation
   */
  @Override
  public int batchSize() {
    return 0;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Set the {@link TrainingListener}s for this model. If any listeners have previously been set,
   * they will be replaced by this method.
   *
   * <p>NOTE(review): delegates via {@link List#of(Object[])}, which rejects both a null array and
   * null elements — passing either will throw NullPointerException.
   *
   * @param listeners the listeners to set
   */
  @Override
  public void addTrainingListeners(TrainingListener... listeners) {
    addTrainingListeners(List.of(listeners));
  }
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Set the parameters for a given parameter type.
|
|
|
|
|
*
|
|
|
|
|
* @param key the param type key to set
|
|
|
|
|
* @param val the new parameters ndarray
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public void setParam(String key, INDArray val) {
|
|
|
|
|
if (paramTable.containsKey(key)) {
|
|
|
|
|
paramTable.get(key).assign(val);
|
|
|
|
|
} else {
|
|
|
|
|
paramTable.put(key, val);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the parameter array registered under the given key.
   *
   * @param param the parameter key
   * @return the parameter ndarray, or null if no such parameter exists
   */
  @Override
  public INDArray getParam(String param) {
    return paramTable.get(param);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setParams(INDArray params) {
|
|
|
|
|
if (params == paramsFlattened) return; // no op
|
|
|
|
|
setParams(params, 'f');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the flattened (1d) parameter vector for this layer.
   *
   * <p>Note: AbstractLayer itself does not implement params/paramTable/gradient views; this
   * override returns the flattened view held by BaseLayer.
   *
   * @return 1d parameter vector (may be null if parameters have not been set)
   */
  @Override
  public INDArray getParams() {
    return paramsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
  /** Close the layer. No-op in this base implementation. */
  @Override
  public void close() {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given DataSet.
   *
   * <p>No-op in this base implementation.
   *
   * @param dataSet the data set to fit on (ignored)
   */
  @Override
  public void fit(DataSet dataSet) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given MultiDataSet.
   *
   * <p>No-op in this base implementation.
   *
   * @param dataSet the data set to fit on (ignored)
   */
  @Override
  public void fit(MultiDataSet dataSet) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given DataSetIterator.
   *
   * <p>No-op in this base implementation.
   *
   * @param iterator the iterator to fit on (ignored)
   */
  @Override
  public void fit(DataSetIterator iterator) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method fits model with a given MultiDataSetIterator.
   *
   * <p>No-op in this base implementation.
   *
   * @param iterator the iterator to fit on (ignored)
   */
  @Override
  public void fit(MultiDataSetIterator iterator) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * This method returns updater state (if applicable), null otherwise.
   *
   * @return always {@code null}: this base layer holds no updater state
   */
  @Override
  public INDArray updaterState() {
    return null;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Backpropagates the incoming error signal through this layer, producing the parameter
   * gradients and the epsilon to pass to the previous layer. Some texts denote the incoming error
   * as \epsilon; see e.g. http://web.cs.swarthmore.edu/~meeden/cs81/s10/BackPropDeriv.pdf
   *
   * @param epsilon w^(L+1)*delta^(L+1). Or, equiv: dC/da, i.e., (dC/dz)*(dz/da) = dC/da, where C is
   *     cost function a=sigma(z) is activation.
   * @param workspaceMgr Workspace manager
   * @return pair of (gradients for this layer's parameters, epsilon for the previous layer)
   */
  @Override
  public Pair<Gradient, INDArray> backpropGradient(
      INDArray epsilon, LayerWorkspaceMgr workspaceMgr) {
    assertInputSet(true);
    // If this layer is layer L, then epsilon is (w^(L+1)*(d^(L+1))^T) (or equivalent)
    Pair<INDArray, INDArray> zAndPreNorm = preOutputWithPreNorm(true, true, workspaceMgr);
    INDArray z =
        zAndPreNorm.getFirst(); // Note: using preOutput(INDArray) can't be used as this does a
    // setInput(input) and resets the 'appliedDropout' flag
    INDArray preNorm = zAndPreNorm.getSecond();
    // delta = dC/dz: activation-function backprop applied to the incoming epsilon
    INDArray delta =
        getTypedLayerConfiguration()
            .getActivation()
            .backprop(z, epsilon)
            .getFirst(); // TODO handle activation function params

    if (maskArray != null) {
      applyMask(delta);
    }

    Gradient ret = new DefaultGradient();

    if (hasBias()) {
      // Bias gradient: per-column sum of delta, written directly into the gradient view
      INDArray biasGrad = gradientViews.get(DefaultParamInitializer.BIAS_KEY);
      delta.sum(biasGrad, 0); // biasGrad is initialized/zeroed first
      ret.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGrad);
    }

    INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, true, workspaceMgr);

    // Epsilon for the previous layer, shaped [nIn, miniBatch] pre-transpose, 'f' ordered
    INDArray epsilonNext =
        workspaceMgr.createUninitialized(
            ArrayType.ACTIVATION_GRAD,
            delta.dataType(),
            new long[] {W.size(0), delta.size(0)},
            'f');
    if (hasLayerNorm()) {
      INDArray g = getParam(DefaultParamInitializer.GAIN_KEY);

      INDArray dldg = gradientViews.get(DefaultParamInitializer.GAIN_KEY);
      // Layer-norm backprop: modifies delta in place and writes the gain gradient into dldg
      Nd4j.getExecutioner().exec(new LayerNormBp(preNorm, g, delta, delta, dldg, true, 1));
      ret.gradientForVariable().put(DefaultParamInitializer.GAIN_KEY, dldg);
    }

    epsilonNext =
        W.mmuli(delta.transpose(), epsilonNext)
            .transpose(); // W.mmul(delta.transpose()).transpose();

    INDArray weightGrad = gradientViews.get(DefaultParamInitializer.WEIGHT_KEY); // f order
    // weightGrad = input^T * delta, written directly into the gradient view (beta = 0.0)
    Nd4j.gemm(
        getInput().castTo(weightGrad.dataType()),
        delta,
        weightGrad,
        true,
        false,
        1.0,
        0.0); // TODO avoid castTo?
    ret.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGrad);

    // Post-noise params are valid for exactly one forward+backward pass; drop them now
    weightNoiseParams.clear();

    epsilonNext = backpropDropOutIfPresent(epsilonNext);
    return new Pair<>(ret, epsilonNext);
  }
|
|
|
|
|
|
|
|
|
|
public void computeGradientAndScore(LayerWorkspaceMgr workspaceMgr) {
|
|
|
|
|
if (getInput() == null) {
|
|
|
|
|
log.warn("There is no input for this layer '{}'", layerConfiguration);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
INDArray output = activate(true, workspaceMgr);
|
|
|
|
|
setScoreWithZ(output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /** Hook for subclasses to derive this layer's score from the forward-pass output; no-op here. */
  protected void setScoreWithZ(INDArray z) {}
|
|
|
|
|
|
|
|
|
|
  /**
   * Objective function: the specified objective.
   *
   * @return the score for the objective, as computed by the last computeGradientAndScore() call
   */
  @Override
  public double getScore() {
    return score;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the gradient stored from the most recent backward pass.
   *
   * @return the stored gradient (may be null before any backward pass)
   */
  @Override
  public Gradient gradient() {
    return gradient;
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ConvexOptimizer getOptimizer() {
|
|
|
|
|
if (optimizer == null) {
|
|
|
|
|
Solver solver = new Solver.Builder().model(this).configure(getNetConfiguration()).build();
|
|
|
|
|
this.optimizer = solver.getOptimizer();
|
|
|
|
|
}
|
|
|
|
|
return optimizer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the parameters of the neural network as a flattened row vector.
   *
   * @return the parameters of the neural network
   */
  @Override
  public INDArray getModelParams() {
    return paramsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
public void setParamsTable(INDArray paramsTable) {
|
|
|
|
|
if (paramsTable == paramsFlattened) {
|
|
|
|
|
return; // no op
|
|
|
|
|
}
|
|
|
|
|
setParams(paramsTable, 'f');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Copies the given flattened parameter vector into this layer's parameter arrays.
   *
   * <p>Validates the total length against the layer's declared variables, then assigns each
   * parameter, in paramTable iteration order, from consecutive slices of {@code params}.
   *
   * <p>NOTE(review): the length check iterates {@code layerConfiguration.getVariables()} while the
   * assignment loop iterates {@code getParamTable().keySet()} — these are assumed to cover the
   * same parameters; confirm they cannot diverge. Also assumes {@code params} is a row vector
   * (indexed with point(0)), and accumulates the expected length in an int.
   *
   * @param params flattened parameter vector; length must equal the total parameter count
   * @param order ordering ('c' or 'f') used when reshaping each slice to the parameter's shape
   * @throws IllegalArgumentException if params has the wrong total length
   * @throws IllegalStateException if a slice length does not match the target parameter length
   */
  protected void setParams(INDArray params, char order) {
    if (params == null) {
      log.trace(
          "setParams(INDArray params, char order): params is null. Skipping setParams in Layer {}[{}] at index {}",
          getLayerConfiguration().getName(),
          getClass().getSimpleName(),
          getIndex());
      return;
    }
    Set<String> parameterList = layerConfiguration.getVariables(); // netWideVariables();
    int length = 0;
    for (String s : parameterList) {
      length += getParam(s).length();
    }
    if (params.length() != length) {
      throw new IllegalArgumentException(
          "Unable to set parameters: must be of length "
              + length
              + ", got params of length "
              + params.length()
              + " - "
              + layerId());
    }
    int idx = 0;
    Set<String> paramKeySet = this.getParamTable().keySet();
    for (String s : paramKeySet) {
      INDArray param = getParam(s);
      // Slice the next param.length() elements out of the flattened row vector
      INDArray get =
          params.get(NDArrayIndex.point(0), NDArrayIndex.interval(idx, idx + param.length()));
      if (param.length() != get.length()) {
        throw new IllegalStateException(
            "Parameter "
                + s
                + " should have been of length "
                + param.length()
                + " but was "
                + get.length()
                + " - "
                + layerId());
      }
      param.assign(
          get.reshape(
              order,
              param.shape())); // Use assign due to backprop params being a view of a larger array
      idx += param.length();
    }
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setParamsViewArray(INDArray params) {
|
|
|
|
|
if (this.getParamTable() != null && params.length() != numParams()) {
|
|
|
|
|
throw new IllegalArgumentException(
|
|
|
|
|
"Invalid input: expect params of length "
|
|
|
|
|
+ numParams()
|
|
|
|
|
+ ", got params of length "
|
|
|
|
|
+ params.length()
|
|
|
|
|
+ " - "
|
|
|
|
|
+ layerId());
|
|
|
|
|
}
|
|
|
|
|
this.paramsFlattened = params;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the full parameter table for this layer.
   *
   * @param isBackprop ignored in this implementation — the same table is returned either way
   * @return map of parameter key to parameter array
   */
  @Override
  public Map<String, INDArray> getParamTable(boolean isBackprop) {
    return paramTable;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Returns the flattened gradients view array, as set via setBackpropGradientsViewArray().
   *
   * @return the flattened gradients array (may be null if not yet set)
   */
  @Override
  public INDArray getGradientsViewArray() {
    return gradientsFlattened;
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setBackpropGradientsViewArray(INDArray gradients) {
|
|
|
|
|
if (this.getParamTable() != null && gradients.length() != numParams()) {
|
|
|
|
|
throw new IllegalArgumentException(
|
|
|
|
|
"Invalid input: expect gradients array of length "
|
|
|
|
|
+ numParams(true)
|
|
|
|
|
+ ", got array of length "
|
|
|
|
|
+ gradients.length()
|
|
|
|
|
+ " - "
|
|
|
|
|
+ layerId());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.gradientsFlattened = gradients;
|
|
|
|
|
this.gradientViews =
|
|
|
|
|
layerConfiguration.initializer().getGradientsFromFlattened(layerConfiguration, gradients);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Get the parameter, after applying any weight noise (such as DropConnect) if necessary. Note
   * that during training, this will store the post-noise parameters, as these should be used for
   * both forward pass and backprop, for a single iteration. Consequently, the parameters (post
   * noise) should be cleared after each training iteration
   *
   * @param param Parameter key
   * @param training If true: during training
   * @param workspaceMgr workspace manager
   * @return The parameter, after applying any noise
   */
  protected INDArray getParamWithNoise(
      @NonNull String param, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
    INDArray p;
    LayerConfiguration lconf = getLayerConfiguration();
    if (lconf.getWeightNoise() != null) {
      if (training && weightNoiseParams.size() > 0 && weightNoiseParams.containsKey(param)) {
        // Re-use these weights for both forward pass and backprop - don't want to use 2 different
        // params here
        // These should be cleared during backprop
        return weightNoiseParams.get(param);
      } else {
        // Generate the noisy parameter outside any workspace so it survives workspace scope exits
        try (MemoryWorkspace ws = Nd4j.getMemoryManager().scopeOutOfWorkspaces()) {
          p =
              lconf
                  .getWeightNoise()
                  .getParameter(
                      this, param, getIterationCount(), getEpochCount(), training, workspaceMgr);
        }
      }

      if (training) {
        // Store for re-use in backprop
        weightNoiseParams.put(param, p);
      }
    } else {
      // No weight noise configured: return the raw parameter directly
      return getParam(param);
    }

    return p;
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Computes the pre-activation output z = input*W + b (with optional layer normalization).
   *
   * @param training whether in training mode (affects dropout / weight noise)
   * @param workspaceMgr workspace manager
   * @return the pre-activation array
   */
  protected INDArray preOutput(boolean training, LayerWorkspaceMgr workspaceMgr) {
    return preOutputWithPreNorm(training, false, workspaceMgr).getFirst();
  }
|
|
|
|
|
|
|
|
|
|
  /**
   * Computes the pre-activation output z = input*W (+ layer norm) + b, optionally also returning
   * the pre-normalization activations needed for layer-norm backprop.
   *
   * @param training whether in training mode (affects dropout / weight noise)
   * @param forBackprop if true, the pre-norm array is duplicated so backprop can use it unmodified
   * @param workspaceMgr workspace manager
   * @return pair of (pre-activation output, pre-layer-norm activations); the two are the same
   *     array when layer norm is disabled or forBackprop is false
   * @throws DL4JInvalidInputException if the input is not rank 2 or its column count does not
   *     match the weight matrix row count
   */
  protected Pair<INDArray, INDArray> preOutputWithPreNorm(
      boolean training, boolean forBackprop, @NonNull LayerWorkspaceMgr workspaceMgr) {
    assertInputSet(forBackprop);
    applyDropOutIfNecessary(training, workspaceMgr);
    INDArray W = getParamWithNoise(DefaultParamInitializer.WEIGHT_KEY, training, workspaceMgr);
    INDArray b = getParamWithNoise(DefaultParamInitializer.BIAS_KEY, training, workspaceMgr);
    // Gain parameter only exists when layer norm is enabled
    INDArray g = (hasLayerNorm() ? getParam(DefaultParamInitializer.GAIN_KEY) : null);

    INDArray input = getInput().castTo(dataType);

    // Input validation:
    if (input.rank() != 2 || input.columns() != W.rows()) {
      if (input.rank() != 2) {
        throw new DL4JInvalidInputException(
            "Input that is not a matrix; expected matrix (rank 2), got rank "
                + input.rank()
                + " array with shape "
                + Arrays.toString(input.shape())
                + ". Missing preprocessor or wrong input type? "
                + layerId());
      }
      throw new DL4JInvalidInputException(
          "Input size ("
              + input.columns()
              + " columns; shape = "
              + Arrays.toString(input.shape())
              + ") is invalid: does not match layer input size (layer # inputs = "
              + W.size(0)
              + ") "
              + layerId());
    }

    INDArray ret =
        workspaceMgr.createUninitialized(
            ArrayType.ACTIVATIONS, W.dataType(), input.size(0), W.size(1));
    input
        .castTo(ret.dataType())
        .mmuli(
            W, ret); // TODO Can we avoid this cast? (It should be a no op if not required, however)

    INDArray preNorm = ret;
    if (hasLayerNorm()) {
      // For backprop, keep a copy of the pre-norm activations; LayerNorm overwrites ret in place
      preNorm = (forBackprop ? ret.dup(ret.ordering()) : ret);
      Nd4j.getExecutioner().exec(new LayerNorm(preNorm, g, ret, true, 1));
    }

    // Bias is added AFTER layer normalization
    if (hasBias()) {
      ret.addiRowVector(b);
    }

    if (maskArray != null) {
      applyMask(ret);
    }

    return new Pair<>(ret, preNorm);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public INDArray activate(boolean training, LayerWorkspaceMgr workspaceMgr) {
|
|
|
|
|
INDArray z = preOutput(training, workspaceMgr); // (Input*Weights) + bias
|
|
|
|
|
INDArray ret = getTypedLayerConfiguration().getActivationFn().getActivation(z, training);
|
|
|
|
|
|
|
|
|
|
if (maskArray != null) {
|
|
|
|
|
applyMask(ret);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Sets the given input on this layer and then runs the forward pass.
   *
   * @param input the input to set
   * @param training whether in training mode
   * @param workspaceMgr workspace manager
   * @return the layer's activations
   */
  @Override
  public INDArray activate(
      @NonNull INDArray input, boolean training, @NonNull LayerWorkspaceMgr workspaceMgr) {
    setInput(input, workspaceMgr);
    return activate(training, workspaceMgr);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public double calcRegularizationScore(boolean backpropParamsOnly) {
|
|
|
|
|
double scoreSum = 0.0;
|
|
|
|
|
for (Map.Entry<String, INDArray> e : getParamTable().entrySet()) {
|
|
|
|
|
List<Regularization> l = getTypedLayerConfiguration().getRegularizationByParam(e.getKey());
|
|
|
|
|
if (l == null || l.isEmpty()) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
for (Regularization r : l) {
|
|
|
|
|
scoreSum += r.score(e.getValue(), getIterationCount(), getEpochCount());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return scoreSum;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public Layer clone() {
|
|
|
|
|
Layer layer = null;
|
|
|
|
|
try {
|
|
|
|
|
Constructor c = getClass().getConstructor(NeuralNetConfiguration.class);
|
|
|
|
|
layer = (Layer) c.newInstance(layerConfiguration);
|
|
|
|
|
Map<String, INDArray> linkedTable = new LinkedHashMap<>();
|
|
|
|
|
for (Map.Entry<String, INDArray> entry : getParamTable().entrySet()) {
|
|
|
|
|
linkedTable.put(entry.getKey(), entry.getValue().dup());
|
|
|
|
|
}
|
|
|
|
|
layer.setParamTable(linkedTable);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
log.error("", e);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return layer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Fits this layer on the given input, lazily constructing a Solver on first use and delegating
   * the optimization to it.
   *
   * @param input training input; if null, the previously-set input is used unchanged
   * @param workspaceMgr workspace manager
   */
  @Override
  public void fit(INDArray input, LayerWorkspaceMgr workspaceMgr) {
    if (input != null) {
      setInput(input, workspaceMgr);
      applyDropOutIfNecessary(true, workspaceMgr);
    }
    if (solver == null) {
      solver =
          new Solver.Builder()
              .model(this)
              .configure(getNetConfiguration())
              .listeners(getTrainingListeners())
              .build();
    }
    // Keep the cached optimizer in sync with the solver used for fitting
    this.optimizer = solver.getOptimizer();
    solver.optimize(workspaceMgr);
  }
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return getClass().getName()
|
|
|
|
|
+ "{"
|
|
|
|
|
+ "conf="
|
|
|
|
|
+ layerConfiguration
|
|
|
|
|
+ ", score="
|
|
|
|
|
+ score
|
|
|
|
|
+ ", optimizer="
|
|
|
|
|
+ optimizer
|
|
|
|
|
+ ", listeners="
|
|
|
|
|
+ trainingListeners
|
|
|
|
|
+ '}';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /** Clears layer state (via super) plus any cached post-noise parameters. */
  @Override
  public void clear() {
    super.clear();
    weightNoiseParams.clear();
  }
|
|
|
|
|
|
|
|
|
|
  /** Clears any cached post-noise parameters (see getParamWithNoise). */
  @Override
  public void clearNoiseWeightParams() {
    weightNoiseParams.clear();
  }
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Does this layer have no bias term? Many layers (dense, convolutional, output, embedding) have
|
|
|
|
|
* biases by default, but no-bias versions are possible via configuration
|
|
|
|
|
*
|
|
|
|
|
* @return True if a bias term is present, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
public boolean hasBias() {
|
|
|
|
|
// Overridden by layers supporting no bias mode: dense, output, convolutional, embedding
|
2023-05-08 09:34:44 +02:00
|
|
|
//return true;
|
2023-03-23 17:39:00 +01:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
  /**
   * Does this layer support and is it enabled layer normalization? Only Dense and SimpleRNN Layers
   * support layer normalization.
   *
   * @return True if layer normalization is enabled on this layer, false otherwise
   */
  public boolean hasLayerNorm() {
    // Overridden by layers supporting layer normalization.
    return false;
  }
|
2019-06-06 15:21:15 +03:00
|
|
|
}
|