From 03b9a34bc0acc00843dc7b175b4782a8f4bdbd09 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 27 Jan 2022 18:25:52 +0100 Subject: [PATCH 1/4] Remove TF1 and TF2 Tests --- batchglm/api/models/__init__.py | 11 +- batchglm/api/models/tf1/__init__.py | 3 - batchglm/api/models/tf1/glm_beta.py | 2 - batchglm/api/models/tf1/glm_nb.py | 2 - batchglm/api/models/tf1/glm_norm.py | 2 - batchglm/train/tf1/README.md | 92 -- batchglm/train/tf1/__init__.py | 0 batchglm/train/tf1/base/__init__.py | 3 - batchglm/train/tf1/base/estimator.py | 342 ---- batchglm/train/tf1/base/estimator_graph.py | 15 - batchglm/train/tf1/base/external.py | 2 - batchglm/train/tf1/base/model.py | 39 - batchglm/train/tf1/base_glm/README.md | 2 - batchglm/train/tf1/base_glm/__init__.py | 6 - .../train/tf1/base_glm/estimator_graph.py | 1394 ----------------- batchglm/train/tf1/base_glm/external.py | 3 - batchglm/train/tf1/base_glm/fim.py | 67 - batchglm/train/tf1/base_glm/hessians.py | 100 -- batchglm/train/tf1/base_glm/jacobians.py | 72 - batchglm/train/tf1/base_glm/model.py | 166 -- .../train/tf1/base_glm/reducible_tensors.py | 351 ----- batchglm/train/tf1/base_glm_all/README.md | 2 - batchglm/train/tf1/base_glm_all/__init__.py | 6 - batchglm/train/tf1/base_glm_all/estimator.py | 362 ----- .../train/tf1/base_glm_all/estimator_graph.py | 543 ------- batchglm/train/tf1/base_glm_all/external.py | 12 - .../train/tf1/base_glm_all/external_beta.py | 6 - .../train/tf1/base_glm_all/external_nb.py | 6 - .../train/tf1/base_glm_all/external_norm.py | 6 - batchglm/train/tf1/base_glm_all/fim.py | 115 -- batchglm/train/tf1/base_glm_all/hessians.py | 193 --- batchglm/train/tf1/base_glm_all/jacobians.py | 103 -- .../tf1/base_glm_all/reducible_tensors.py | 99 -- batchglm/train/tf1/external.py | 1 - batchglm/train/tf1/glm_beta/__init__.py | 7 - batchglm/train/tf1/glm_beta/estimator.py | 291 ---- .../train/tf1/glm_beta/estimator_graph.py | 12 - batchglm/train/tf1/glm_beta/external.py | 18 - batchglm/train/tf1/glm_beta/fim.py | 25 - batchglm/train/tf1/glm_beta/hessians.py | 92 -- batchglm/train/tf1/glm_beta/jacobians.py | 40 - batchglm/train/tf1/glm_beta/model.py | 133 -- .../train/tf1/glm_beta/reducible_tensors.py | 13 - .../train/tf1/glm_beta/training_strategies.py | 37 - batchglm/train/tf1/glm_nb/__init__.py | 7 - batchglm/train/tf1/glm_nb/estimator.py | 152 -- batchglm/train/tf1/glm_nb/estimator_graph.py | 12 - batchglm/train/tf1/glm_nb/external.py | 17 - batchglm/train/tf1/glm_nb/fim.py | 43 - batchglm/train/tf1/glm_nb/hessians.py | 93 -- batchglm/train/tf1/glm_nb/jacobians.py | 66 - batchglm/train/tf1/glm_nb/model.py | 136 -- .../train/tf1/glm_nb/reducible_tensors.py | 13 - .../train/tf1/glm_nb/training_strategies.py | 27 - batchglm/train/tf1/glm_norm/__init__.py | 7 - batchglm/train/tf1/glm_norm/estimator.py | 325 ---- .../train/tf1/glm_norm/estimator_graph.py | 12 - batchglm/train/tf1/glm_norm/external.py | 18 - batchglm/train/tf1/glm_norm/fim.py | 28 - batchglm/train/tf1/glm_norm/hessians.py | 66 - batchglm/train/tf1/glm_norm/jacobians.py | 41 - batchglm/train/tf1/glm_norm/model.py | 138 -- .../train/tf1/glm_norm/reducible_tensors.py | 13 - .../train/tf1/glm_norm/training_strategies.py | 27 - batchglm/train/tf1/ops.py | 59 - batchglm/train/tf1/train.py | 315 ---- .../unit_test/test_acc_analytic_glm_all.py | 373 ----- .../test_acc_constrained_vglm_all.py | 140 -- batchglm/unit_test/test_acc_glm_all.py | 528 ------- batchglm/unit_test/test_acc_glm_all_tf2.py | 524 ------- .../unit_test/test_acc_sizefactors_glm_all.py | 103 -- 
batchglm/unit_test/test_hessians_glm_all.py | 187 --- batchglm/unit_test/test_jacobians_glm_all.py | 192 --- .../unit_test/test_jacobians_glm_all_tf2.py | 186 --- batchglm/unit_test/test_simulators_glm_all.py | 128 -- 75 files changed, 1 insertion(+), 8771 deletions(-) delete mode 100644 batchglm/api/models/tf1/__init__.py delete mode 100644 batchglm/api/models/tf1/glm_beta.py delete mode 100644 batchglm/api/models/tf1/glm_nb.py delete mode 100644 batchglm/api/models/tf1/glm_norm.py delete mode 100644 batchglm/train/tf1/README.md delete mode 100644 batchglm/train/tf1/__init__.py delete mode 100644 batchglm/train/tf1/base/__init__.py delete mode 100644 batchglm/train/tf1/base/estimator.py delete mode 100644 batchglm/train/tf1/base/estimator_graph.py delete mode 100644 batchglm/train/tf1/base/external.py delete mode 100644 batchglm/train/tf1/base/model.py delete mode 100644 batchglm/train/tf1/base_glm/README.md delete mode 100644 batchglm/train/tf1/base_glm/__init__.py delete mode 100644 batchglm/train/tf1/base_glm/estimator_graph.py delete mode 100644 batchglm/train/tf1/base_glm/external.py delete mode 100644 batchglm/train/tf1/base_glm/fim.py delete mode 100644 batchglm/train/tf1/base_glm/hessians.py delete mode 100644 batchglm/train/tf1/base_glm/jacobians.py delete mode 100644 batchglm/train/tf1/base_glm/model.py delete mode 100644 batchglm/train/tf1/base_glm/reducible_tensors.py delete mode 100644 batchglm/train/tf1/base_glm_all/README.md delete mode 100644 batchglm/train/tf1/base_glm_all/__init__.py delete mode 100644 batchglm/train/tf1/base_glm_all/estimator.py delete mode 100644 batchglm/train/tf1/base_glm_all/estimator_graph.py delete mode 100644 batchglm/train/tf1/base_glm_all/external.py delete mode 100644 batchglm/train/tf1/base_glm_all/external_beta.py delete mode 100644 batchglm/train/tf1/base_glm_all/external_nb.py delete mode 100644 batchglm/train/tf1/base_glm_all/external_norm.py delete mode 100644 batchglm/train/tf1/base_glm_all/fim.py delete mode 100644 batchglm/train/tf1/base_glm_all/hessians.py delete mode 100644 batchglm/train/tf1/base_glm_all/jacobians.py delete mode 100644 batchglm/train/tf1/base_glm_all/reducible_tensors.py delete mode 100644 batchglm/train/tf1/external.py delete mode 100644 batchglm/train/tf1/glm_beta/__init__.py delete mode 100644 batchglm/train/tf1/glm_beta/estimator.py delete mode 100644 batchglm/train/tf1/glm_beta/estimator_graph.py delete mode 100644 batchglm/train/tf1/glm_beta/external.py delete mode 100644 batchglm/train/tf1/glm_beta/fim.py delete mode 100644 batchglm/train/tf1/glm_beta/hessians.py delete mode 100644 batchglm/train/tf1/glm_beta/jacobians.py delete mode 100644 batchglm/train/tf1/glm_beta/model.py delete mode 100644 batchglm/train/tf1/glm_beta/reducible_tensors.py delete mode 100644 batchglm/train/tf1/glm_beta/training_strategies.py delete mode 100644 batchglm/train/tf1/glm_nb/__init__.py delete mode 100644 batchglm/train/tf1/glm_nb/estimator.py delete mode 100644 batchglm/train/tf1/glm_nb/estimator_graph.py delete mode 100644 batchglm/train/tf1/glm_nb/external.py delete mode 100644 batchglm/train/tf1/glm_nb/fim.py delete mode 100644 batchglm/train/tf1/glm_nb/hessians.py delete mode 100644 batchglm/train/tf1/glm_nb/jacobians.py delete mode 100644 batchglm/train/tf1/glm_nb/model.py delete mode 100644 batchglm/train/tf1/glm_nb/reducible_tensors.py delete mode 100644 batchglm/train/tf1/glm_nb/training_strategies.py delete mode 100644 batchglm/train/tf1/glm_norm/__init__.py delete mode 100644 batchglm/train/tf1/glm_norm/estimator.py 
delete mode 100644 batchglm/train/tf1/glm_norm/estimator_graph.py delete mode 100644 batchglm/train/tf1/glm_norm/external.py delete mode 100644 batchglm/train/tf1/glm_norm/fim.py delete mode 100644 batchglm/train/tf1/glm_norm/hessians.py delete mode 100644 batchglm/train/tf1/glm_norm/jacobians.py delete mode 100644 batchglm/train/tf1/glm_norm/model.py delete mode 100644 batchglm/train/tf1/glm_norm/reducible_tensors.py delete mode 100644 batchglm/train/tf1/glm_norm/training_strategies.py delete mode 100644 batchglm/train/tf1/ops.py delete mode 100644 batchglm/train/tf1/train.py delete mode 100644 batchglm/unit_test/test_acc_analytic_glm_all.py delete mode 100644 batchglm/unit_test/test_acc_constrained_vglm_all.py delete mode 100644 batchglm/unit_test/test_acc_glm_all.py delete mode 100644 batchglm/unit_test/test_acc_glm_all_tf2.py delete mode 100644 batchglm/unit_test/test_acc_sizefactors_glm_all.py delete mode 100644 batchglm/unit_test/test_hessians_glm_all.py delete mode 100644 batchglm/unit_test/test_jacobians_glm_all.py delete mode 100644 batchglm/unit_test/test_jacobians_glm_all_tf2.py delete mode 100644 batchglm/unit_test/test_simulators_glm_all.py diff --git a/batchglm/api/models/__init__.py b/batchglm/api/models/__init__.py index b6c68fb0..eff3c3f2 100644 --- a/batchglm/api/models/__init__.py +++ b/batchglm/api/models/__init__.py @@ -1,14 +1,5 @@ from . import numpy try: - import tensorflow as tf - if tf.__version__.split(".")[0] == "1": - from . import tf1 - else: - tf1 = None - if tf.__version__.split(".")[0] == "2": - from . import tf2 - else: - tf2 = None + from . import tf2 except ImportError: - tf1 = None tf2 = None diff --git a/batchglm/api/models/tf1/__init__.py b/batchglm/api/models/tf1/__init__.py deleted file mode 100644 index 8fbdb228..00000000 --- a/batchglm/api/models/tf1/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import glm_beta -from . import glm_nb -from . import glm_norm diff --git a/batchglm/api/models/tf1/glm_beta.py b/batchglm/api/models/tf1/glm_beta.py deleted file mode 100644 index ce7e336c..00000000 --- a/batchglm/api/models/tf1/glm_beta.py +++ /dev/null @@ -1,2 +0,0 @@ -from batchglm.models.glm_beta import InputDataGLM, Model, Simulator -from batchglm.train.tf1.glm_beta import Estimator \ No newline at end of file diff --git a/batchglm/api/models/tf1/glm_nb.py b/batchglm/api/models/tf1/glm_nb.py deleted file mode 100644 index fc0f72ab..00000000 --- a/batchglm/api/models/tf1/glm_nb.py +++ /dev/null @@ -1,2 +0,0 @@ -from batchglm.models.glm_nb import InputDataGLM, Model, Simulator -from batchglm.train.tf1.glm_nb import Estimator \ No newline at end of file diff --git a/batchglm/api/models/tf1/glm_norm.py b/batchglm/api/models/tf1/glm_norm.py deleted file mode 100644 index 7dc1ce0f..00000000 --- a/batchglm/api/models/tf1/glm_norm.py +++ /dev/null @@ -1,2 +0,0 @@ -from batchglm.models.glm_norm import InputDataGLM, Model, Simulator -from batchglm.train.tf1.glm_norm import Estimator \ No newline at end of file diff --git a/batchglm/train/tf1/README.md b/batchglm/train/tf1/README.md deleted file mode 100644 index 3bf0d9f0..00000000 --- a/batchglm/train/tf1/README.md +++ /dev/null @@ -1,92 +0,0 @@ -Implementation of models using Tensorflow -==== -This module contains all model estimators depending on Tensorflow. 
- - -Template to implement a new model estimator: ----- -First, set up a parameter definition defining all model parameters together with the corresponding dimensions: -```python -PARAMS = { - "param_1": ("samples", "variables"), - "param_2": ("variables",), - ... -} -``` -All equally-named dimensions have to be of the same size. - -Create a Tensorflow model with all necessary parameters: -```python -from impl.tf.base import TFEstimatorGraph - -class EstimatorGraph(TFEstimatorGraph): - def __init__(self, graph): - TFEstimatorGraph.__init__(self, graph) - # required by TFEstimatorGraph - self.global_step = tf.train.get_or_create_global_step() - self.init_op = ... - self.loss = ... - self.train_op = ... - # parameters: - self.param_1 = ... - self.param_2 = ... - -``` -Now create the actual Estimator for the given model: -```python -from models. import AbstractEstimator -from impl.tf.base import MonitoredTFEstimator - -class SomeEstimator(AbstractEstimator, MonitoredTFEstimator, metaclass=abc.ABCMeta): - model: EstimatorGraph - - # Set up a PARAMS property returning the previously created parameter definition: - # This property is used among other things for exporting data to NetCDF-format. - @property - def PARAMS(cls) -> dict: - return PARAMS - - def __init__(self, input_data, model=None): - if model is None: - tf.reset_default_graph() - # create model - model = EstimatorGraph(graph=tf.get_default_graph()) - - MonitoredTFEstimator.__init__(self, input_data, model) - - # The scaffold provides some information about the model graph to the training session. - # It is possible to add additional capabilities like a summary_op which writes summaries for TensorBoard - tf1 - def _scaffold(self): - with self.model.graph.as_default(): - scaffold = tf.train.Scaffold( - init_op=self.model.init_op, - summary_op=self.model.merged_summary, - saver=self.model.saver, - ) - return scaffold - - # Overwrite this method if you would like to feed additional data during the training - def train(self, *args, learning_rate=0.05, **kwargs): - tf.logging.info("learning rate: %s" % learning_rate) - super().train(feed_dict={"learning_rate:0": learning_rate}) - - # Now define all parameters requested by this model - # (defined in model..AbstractEstimator) - @property - def param_1(self): - return self.get("param_1") # equal to self.run(self.model.param_1) - @property - def param_2(self): - return self.get("param_2") # equal to self.run(self.model.param_2) - -``` - -Some additional notes: -- estimator.get("param_1") == estimator.session.run(estimator.model.param_1) -- estimator.to_xarray(param_list) needs the PARAMS definition to export the estimated parameters as - xarray.Dataset() -- All necessary parameters should be directly exposed as parameter tensors in EstimatorGraph - (e.g. EstimatorGraph().param_1) with correct shapes as defined in PARAMS. - However, this property is currently not validated automatically. 
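For illustration, a minimal usage sketch of an estimator built from this template, assembled from the calls the README and the deleted `_TFEstimator` code describe (`SomeEstimator`, `input_data`, `param_1`, and `param_2` are hypothetical names, not part of the batchglm API):

```python
# Hypothetical flow, assuming SomeEstimator was implemented as outlined above.
estimator = SomeEstimator(input_data)
estimator.initialize()                # opens a tf.compat.v1.Session and runs the scaffold's init_op
estimator.train(learning_rate=0.05)   # feeds "learning_rate:0" via feed_dict, as in train() above
param_1 = estimator.get("param_1")    # equivalent to estimator.session.run(estimator.model.param_1)
ds = estimator.to_xarray(["param_1", "param_2"])  # export uses the PARAMS definition
```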
- diff --git a/batchglm/train/tf1/__init__.py b/batchglm/train/tf1/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/batchglm/train/tf1/base/__init__.py b/batchglm/train/tf1/base/__init__.py deleted file mode 100644 index 67d248f6..00000000 --- a/batchglm/train/tf1/base/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .estimator import _TFEstimator -from .estimator_graph import TFEstimatorGraph -from .model import ProcessModelBase \ No newline at end of file diff --git a/batchglm/train/tf1/base/estimator.py b/batchglm/train/tf1/base/estimator.py deleted file mode 100644 index 5f5b6359..00000000 --- a/batchglm/train/tf1/base/estimator.py +++ /dev/null @@ -1,342 +0,0 @@ -import abc -from enum import Enum -import logging -import numpy as np -import pprint -import tensorflow as tf -import time -from typing import Dict, Any, Union, Iterable - -from .external import _EstimatorBase, pkg_constants - -logger = logging.getLogger("batchglm") - - -class TFEstimatorGraph(metaclass=abc.ABCMeta): - graph: tf.Graph - loss: tf.Tensor - init_op: tf.Tensor - train_op: tf.Tensor - global_step: tf.Tensor - - def __init__(self, graph=None): - if graph is None: - graph = tf.Graph() - self.graph = graph - - -class _TFEstimator(metaclass=abc.ABCMeta): - - session: tf.compat.v1.Session - feed_dict: Dict[Union[Union[tf.Tensor, tf.Operation], Any], Any] - _param_decorators: Dict[str, callable] - - def __init__( - self - ): - self.session = None - self.feed_dict = {} - self._param_decorators = dict() - - def initialize(self): - self.close_session() - self.feed_dict = {} - with self.model.graph.as_default(): - # set up session parameters - self.session = tf.compat.v1.Session(config=pkg_constants.TF_CONFIG_PROTO) - self.session.run(self._scaffold().init_op, feed_dict=self.feed_dict) - - def close_session(self): - if self.session is None: - return False - try: - self.session.close() - return True - except (tf.errors.OpError, RuntimeError): - return False - - def run(self, tensor, feed_dict=None): - if feed_dict is None: - feed_dict = self.feed_dict - - return self.session.run(tensor, feed_dict=feed_dict) - - @abc.abstractmethod - def _scaffold(self) -> tf.compat.v1.train.Scaffold: - """ - Should create a training scaffold for this Estimator's model - - :return: tf1.compat.v1.train.Scaffold object - """ - pass - - def _get_unsafe(self, key: Union[str, Iterable]) -> Union[Any, Dict[str, Any]]: - if isinstance(key, str): - return self.run(self.model.__getattribute__(key)) - elif isinstance(key, Iterable): - d = {s: self.model.__getattribute__(s) for s in key} - return self.run(d) - - def get(self, key: Union[str, Iterable]) -> Union[Any, Dict[str, Any]]: - """ - Returns the values of the tensor(s) specified by key. - - :param key: Either a string or an iterable list/set/tuple/etc. 
of strings - :return: Single array if `key` is a string or a dict {k: value} of arrays if `key` is a collection of strings - """ - if isinstance(key, str): - if key not in self.param_shapes(): - raise ValueError("Unknown parameter %s" % key) - elif isinstance(key, Iterable): - for k in list(key): - if k not in self.param_shapes(): - raise ValueError("Unknown parameter %s" % k) - return self._get_unsafe(key) - - @property - def global_step(self): - return self._get_unsafe("global_step") - - @property - def loss(self): - return self._get_unsafe("loss") - - def _train( - self, - *args, - learning_rate=None, - feed_dict=None, - convergence_criteria="all_converged", - stopping_criteria=None, - train_op=None, - trustregion_mode=False, - require_hessian=False, - require_fim=False, - is_batched=False, - **kwargs - ): - """ - Starts training of the model - - :param feed_dict: dict of values which will be feeded each `session.run()` - - See also feed_dict parameter of `session.run()`. - :param convergence_criteria: criteria after which the training will be interrupted. - - Currently implemented criterias: - - - "step": - stop, when the step counter reaches `stopping_criteria` - :param stopping_criteria: Additional parameter for convergence criteria. - - See parameter `convergence_criteria` for exact meaning - :param loss_window_size: specifies `N` in `convergence_criteria`. - :param train_op: uses this training operation if specified - """ - # Set default values: - if stopping_criteria is None: - if convergence_criteria == "step": - stopping_criteria = 100 - - if train_op is None: - train_op = self.model.train_op - - # Initialize: - if pkg_constants.EVAL_ON_BATCHED and is_batched: - _, _ = self.session.run( - (self.model.batched_data_model.eval_set, - self.model.model_vars.convergence_update), - feed_dict={self.model.model_vars.convergence_status: - np.repeat(False, repeats=self.model.model_vars.converged.shape[0]) - } - ) - ll_current = self.session.run(self.model.batched_data_model.norm_neg_log_likelihood) - else: - # Have to use eval1 here so that correct object is pulled in trust region. - _, _ = self.session.run( - (self.model.full_data_model.eval1_set, - self.model.model_vars.convergence_update), - feed_dict={self.model.model_vars.convergence_status: - np.repeat(False, repeats=self.model.model_vars.converged.shape[0]) - } - ) - ll_current = self.session.run(self.model.full_data_model.norm_neg_log_likelihood_eval1) - - logging.getLogger("batchglm").info( - "Step: 0 loss: %f models converged 0", - np.sum(ll_current) - ) - - # Set all to convergence status to False, this is need if multiple training strategies are run: - converged_current = np.repeat(False, repeats=self.model.model_vars.converged.shape[0]) - train_step = 0 - - def convergence_decision(convergence_status, step_counter): - if convergence_criteria == "step": - return np.any(np.logical_not(convergence_status)) and step_counter < stopping_criteria - elif convergence_criteria == "all_converged": - return np.any(np.logical_not(convergence_status)) - else: - raise ValueError("convergence_criteria %s not recognized." % convergence_criteria) - - while convergence_decision(converged_current, train_step): - t0 = time.time() - converged_prev = converged_current.copy() - ll_prev = ll_current.copy() - - ## Run update. 
- t_a = time.time() - if is_batched: - _ = self.session.run(self.model.batched_data_model.train_set) - else: - _ = self.session.run(self.model.full_data_model.train_set) - - if trustregion_mode: - t_b = time.time() - _, x_step = self.session.run( - (train_op["train"]["trial_op"], - train_op["update"]), - feed_dict=feed_dict - ) - t_c = time.time() - _ = self.session.run(self.model.full_data_model.eval0_set) - t_d = time.time() - train_step, _, features_updated = self.session.run( - (self.model.global_step, - train_op["train"]["update_op"], - self.model.model_vars.updated), - feed_dict=feed_dict - ) - t_e = time.time() - else: - t_b = time.time() - train_step, _, x_step, features_updated = self.session.run( - (self.model.global_step, - train_op["train"], - train_op["update"], - self.model.model_vars.updated), - feed_dict=feed_dict - ) - t_c = time.time() - - if pkg_constants.EVAL_ON_BATCHED and is_batched: - _ = self.session.run(self.model.batched_data_model.eval_set) - ll_current, jac_train = self.session.run( - (self.model.batched_data_model.norm_neg_log_likelihood, - self.model.batched_data_model.neg_jac_train_eval) - ) - else: - _ = self.session.run(self.model.full_data_model.eval1_set) - ll_current, jac_train = self.session.run( - (self.model.full_data_model.norm_neg_log_likelihood_eval1, - self.model.full_data_model.neg_jac_train_eval) - ) - t_f = time.time() - - if trustregion_mode: - logging.getLogger("batchglm").debug( - "### run time break-down: reduce op. %s, trial %s, ll %s, update %s, eval %s", - str(np.round(t_b - t_a, 3)), - str(np.round(t_c - t_b, 3)), - str(np.round(t_d - t_c, 3)), - str(np.round(t_e - t_d, 3)), - str(np.round(t_f - t_e, 3)) - ) - else: - logging.getLogger("batchglm").debug( - "### run time break-down: reduce op. %s, update %s, eval %s", - str(np.round(t_b - t_a, 3)), - str(np.round(t_c - t_b, 3)), - str(np.round(t_f - t_c, 3)) - ) - - if len(self.model.full_data_model.idx_train_loc) > 0: - x_norm_loc = np.sqrt(np.sum(np.square( - np.abs(x_step[self.model.model_vars.idx_train_loc, :]) - ), axis=0)) - else: - x_norm_loc = np.zeros([self.model.model_vars.n_features]) - - if len(self.model.full_data_model.idx_train_scale) > 0: - x_norm_scale = np.sqrt(np.sum(np.square( - np.abs(x_step[self.model.model_vars.idx_train_scale, :]) - ), axis=0)) - else: - x_norm_scale = np.zeros([self.model.model_vars.n_features]) - - # Update convergence status of non-converged features: - # Cost function value improvement: - ll_converged = (ll_prev - ll_current) / ll_prev < pkg_constants.LLTOL_BY_FEATURE - if not pkg_constants.EVAL_ON_BATCHED or not is_batched: - if np.any(ll_current > ll_prev + 1e-12): - logging.getLogger("batchglm").warning("bad update found: %i bad updates" % np.sum(ll_current > ll_prev + 1e-12)) - - converged_current = np.logical_or( - converged_prev, - np.logical_and(ll_converged, features_updated) - ) - converged_f = np.logical_and( - np.logical_not(converged_prev), - np.logical_and(ll_converged, features_updated) - ) - # Gradient norm: - if pkg_constants.EVAL_ON_BATCHED and is_batched: - jac_normalization = self.model.batch_size - else: - jac_normalization = self.model.num_observations - - if len(self.model.full_data_model.idx_train_loc) > 0: - idx_jac_loc = np.array([list(self.model.full_data_model.idx_train).index(x) - for x in self.model.full_data_model.idx_train_loc]) - grad_norm_loc = np.sum(np.abs(jac_train[:, idx_jac_loc]), axis=1) / jac_normalization - else: - grad_norm_loc = np.zeros([self.model.model_vars.n_features]) - if 
len(self.model.full_data_model.idx_train_scale) > 0: - idx_jac_scale = np.array([list(self.model.full_data_model.idx_train).index(x) - for x in self.model.full_data_model.idx_train_scale]) - grad_norm_scale = np.sum(np.abs(jac_train[:, idx_jac_scale]), axis=1) / jac_normalization - else: - grad_norm_scale = np.zeros([self.model.model_vars.n_features]) - converged_g = np.logical_and( - np.logical_not(converged_prev), - np.logical_and( - grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, - grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE - ) - ) - converged_current = np.logical_or( - converged_current, - np.logical_and( - grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, - grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE - ) - ) - # Step length: - converged_x = np.logical_and( - np.logical_not(converged_prev), - np.logical_and( - x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, - x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE - ) - ) - converged_current = np.logical_or( - converged_current, - np.logical_and( - x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, - x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE - ) - ) - t1 = time.time() - - self.session.run((self.model.model_vars.convergence_update), feed_dict={ - self.model.model_vars.convergence_status: converged_current - }) - logging.getLogger("batchglm").info( - "Step: %d loss: %f, converged %i in %s sec., updated %i, {f: %i, g: %i, x: %i}", - train_step, - np.sum(ll_current), - np.sum(converged_current).astype("int32"), - str(np.round(t1 - t0, 3)), - np.sum(np.logical_and(np.logical_not(converged_prev), features_updated)).astype("int32"), - np.sum(converged_f), np.sum(converged_g), np.sum(converged_x) - ) diff --git a/batchglm/train/tf1/base/estimator_graph.py b/batchglm/train/tf1/base/estimator_graph.py deleted file mode 100644 index 2b420809..00000000 --- a/batchglm/train/tf1/base/estimator_graph.py +++ /dev/null @@ -1,15 +0,0 @@ -import abc -import tensorflow as tf - - -class TFEstimatorGraph(metaclass=abc.ABCMeta): - graph: tf.Graph - loss: tf.Tensor - init_op: tf.Tensor - train_op: tf.Tensor - global_step: tf.Tensor - - def __init__(self, graph=None): - if graph is None: - graph = tf.Graph() - self.graph = graph diff --git a/batchglm/train/tf1/base/external.py b/batchglm/train/tf1/base/external.py deleted file mode 100644 index 5dd321e1..00000000 --- a/batchglm/train/tf1/base/external.py +++ /dev/null @@ -1,2 +0,0 @@ -from batchglm.models.base import _EstimatorBase -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/base/model.py b/batchglm/train/tf1/base/model.py deleted file mode 100644 index ebfec6e0..00000000 --- a/batchglm/train/tf1/base/model.py +++ /dev/null @@ -1,39 +0,0 @@ -import abc -import logging - -import tensorflow as tf -import numpy as np - -logger = logging.getLogger(__name__) - - -class ProcessModelBase: - - @abc.abstractmethod - def param_bounds(self, dtype): - pass - - def tf_clip_param( - self, - param, - name - ): - bounds_min, bounds_max = self.param_bounds(param.dtype) - return tf.clip_by_value( - param, - bounds_min[name], - bounds_max[name] - ) - - def np_clip_param( - self, - param, - name - ): - bounds_min, bounds_max = self.param_bounds(param.dtype) - return np.clip( - param, - bounds_min[name], - bounds_max[name], - # out=param - ) diff --git a/batchglm/train/tf1/base_glm/README.md b/batchglm/train/tf1/base_glm/README.md deleted file mode 100644 index eea79ccc..00000000 --- a/batchglm/train/tf1/base_glm/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Classes with GLM specific 
code. -All noise models that are in the GLM category inherit all of these classes. \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm/__init__.py b/batchglm/train/tf1/base_glm/__init__.py deleted file mode 100644 index c77b285b..00000000 --- a/batchglm/train/tf1/base_glm/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .estimator_graph import GradientGraphGLM, NewtonGraphGLM, TrainerGraphGLM, EstimatorGraphGLM, FullDataModelGraphGLM, BatchedDataModelGraphGLM -from .hessians import HessiansGLM -from .fim import FIMGLM -from .jacobians import JacobiansGLM -from .model import ProcessModelGLM, ModelVarsGLM, BasicModelGraphGLM -from .reducible_tensors import ReducableTensorsGLM diff --git a/batchglm/train/tf1/base_glm/estimator_graph.py b/batchglm/train/tf1/base_glm/estimator_graph.py deleted file mode 100644 index 543c6468..00000000 --- a/batchglm/train/tf1/base_glm/estimator_graph.py +++ /dev/null @@ -1,1394 +0,0 @@ -import abc -import logging -from typing import Union - -import numpy as np -import tensorflow as tf - -try: - import anndata -except ImportError: - anndata = None - -from .model import ModelVarsGLM -from .fim import FIMGLM -from .hessians import HessiansGLM -from .jacobians import JacobiansGLM -from .external import TFEstimatorGraph -from .external import train_utils -from .external import pkg_constants - -logger = logging.getLogger(__name__) - - -class FullDataModelGraphGLM: - """ - Computational graph to evaluate model on full data set. - - Here, we assume that the model cannot be executed on the full data set - for memory reasons and therefore divide the data set into batches, - execute the model on these batches and summarise the resulting metrics - across batches. FullDataModelGraph is therefore an extension of - BasicModelGraph that distributes operations across batches of observations. - - The distribution is performed by the function map_model(). - The model metrics which can be collected are: - - - The model likelihood (cost function value). - - Model Jacobian matrix for trainer parameters (for training). - - Model Jacobian matrix for all parameters (for downstream usage, - e.g. hypothesis tests which can also be performed on closed form MLEs). - - Model Hessian matrix for trainer parameters (for training). - - Model Hessian matrix for all parameters (for downstream usage, - e.g. hypothesis tests which can also be performed on closed form MLEs). - """ - log_likelihood: tf.Tensor - norm_log_likelihood: tf.Tensor - norm_neg_log_likelihood: tf.Tensor - loss: tf.Tensor - - jac: JacobiansGLM - neg_jac_train: tf.Tensor - - hessians: HessiansGLM - neg_hessians_train: tf.Tensor - - fim: FIMGLM - fim_train: tf.Tensor - - noise_model: str - - -class BatchedDataModelGraphGLM: - """ - Computational graph to evaluate model on batches of data set. - - The model metrics of a batch which can be collected are: - - - The model likelihood (cost function value). - - Model Jacobian matrix for trained parameters (for training). - - Model Hessian matrix for trained parameters (for training). - - Model Fisher information matrix for trained parameters (for training). - """ - log_likelihood: tf.Tensor - norm_log_likelihood: tf.Tensor - norm_neg_log_likelihood: tf.Tensor - loss: tf.Tensor - - neg_jac_train: tf.Tensor - neg_hessians_train: tf.Tensor - fim_train: tf.Tensor - - noise_model: str - - -class GradientGraphGLM: - """ - - Define newton-rhapson updates and gradients depending on whether data is batched. 
- The has to be distinguished as there are different jacobians - and hessians for the full and the batched data. - """ - model_vars: ModelVarsGLM - full_data_model: FullDataModelGraphGLM - batched_data_model: BatchedDataModelGraphGLM - - def __init__( - self, - model_vars: ModelVarsGLM, - full_data_model: FullDataModelGraphGLM, - batched_data_model: BatchedDataModelGraphGLM, - train_loc, - train_scale - ): - self.gradients_full_raw = None - self.gradients_batch_raw = None - self.model_vars = model_vars - self.full_data_model = full_data_model - self.batched_data_model = batched_data_model - - if train_loc or train_scale: - self.gradients_full() - if self.batched_data_model is not None: - self.gradients_batched() - - # Pad gradients to receive update tensors that match - # the shape of model_vars.params. - if train_loc: - if train_scale: - if self.batched_data_model is not None: - gradients_batch = self.gradients_batch_raw - gradients_full = self.gradients_full_raw - else: - if self.batched_data_model is not None: - gradients_batch = tf.concat([ - self.gradients_batch_raw, - tf.zeros_like(self.model_vars.b_var) - ], axis=0) - gradients_full = tf.concat([ - self.gradients_full_raw, - tf.zeros_like(self.model_vars.b_var) - ], axis=0) - else: - if self.batched_data_model is not None: - gradients_batch = tf.concat([ - tf.zeros_like(self.model_vars.a_var), - self.gradients_batch_raw - ], axis=0) - gradients_full = tf.concat([ - tf.zeros_like(self.model_vars.a_var), - self.gradients_full_raw - ], axis=0) - else: - # These gradients are returned for convergence evaluation. - # In this case, closed form estimates were used, one could - # still evaluate the gradients here but we do not do - # this to speed up run time. - if self.batched_data_model is not None: - gradients_batch = tf.zeros_like(self.model_vars.params) - gradients_full = tf.zeros_like(self.model_vars.params) - - # Save attributes necessary for reinitialization: - self.train_loc = train_loc - self.train_scale = train_scale - - self.gradients_full = gradients_full - if self.batched_data_model is not None: - self.gradients_batch = gradients_batch - else: - self.gradients_batch = None - - def gradients_full(self): - gradients_full = tf.transpose(self.full_data_model.neg_jac_train) - self.gradients_full_raw = gradients_full - - def gradients_batched(self): - gradients_batch = tf.transpose(self.batched_data_model.neg_jac_train) - self.gradients_batch_raw = gradients_batch - - -class NewtonGraphGLM: - """ - Define update vectors which require a matrix inversion: Newton-Raphson and - IRLS updates. - - Define newton-type updates and gradients depending on whether data is batched. - This has to be distinguished as there are different jacobians - and hessians for the full and the batched data. 
- """ - model_vars: tf.Tensor - full_data_model: FullDataModelGraphGLM - batched_data_model: BatchedDataModelGraphGLM - - nr_update_full: Union[tf.Tensor, None] - nr_update_batched: Union[tf.Tensor, None] - nr_tr_update_full: Union[tf.Tensor, None] - nr_tr_update_batched: Union[tf.Tensor, None] - - irls_update_full: Union[tf.Tensor, None] - irls_update_batched: Union[tf.Tensor, None] - irls_tr_update_full: Union[tf.Tensor, None] - irls_tr_update_batched: Union[tf.Tensor, None] - - nr_tr_radius: Union[tf.Variable, None] - nr_tr_pred_cost_gain_full: Union[tf.Tensor, None] - nr_tr_pred_cost_gain_batched: Union[tf.Tensor, None] - - irls_tr_radius: Union[tf.Variable, None] - irls_tr_pred_cost_gain_full: Union[tf.Tensor, None] - irls_tr_pred_cost_gain_batched: Union[tf.Tensor, None] - - def __init__( - self, - provide_optimizers, - train_mu, - train_r, - dtype - ): - if train_mu or train_r: - if provide_optimizers["nr"] or provide_optimizers["nr_tr"]: - if self.batched_data_model is None: - batched_lhs = None - batched_rhs = None - else: - batched_lhs = self.batched_data_model.neg_hessians_train - batched_rhs = self.batched_data_model.neg_jac_train - - nr_update_full_raw, nr_update_batched_raw = self.build_updates_nr( - full_lhs=self.full_data_model.neg_hessians_train, - batched_lhs=batched_lhs, - full_rhs=self.full_data_model.neg_jac_train, - batched_rhs=batched_rhs, - psd=False - ) - nr_update_full, nr_update_batched = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - update_full_raw=nr_update_full_raw, - update_batched_raw=nr_update_batched_raw - ) - - self.nr_tr_x_step_full = tf.Variable(tf.zeros_like(nr_update_full)) - if self.batched_data_model is None: - self.nr_tr_x_step_batched = None - else: - self.nr_tr_x_step_batched = tf.Variable(tf.zeros_like(nr_update_batched)) - else: - nr_update_full = None - nr_update_batched = None - - if provide_optimizers["nr_tr"]: - self.nr_tr_radius = tf.Variable( - np.zeros(shape=[self.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, - dtype=dtype - ) - self.nr_tr_ll_prev_full = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - self.nr_tr_pred_gain_full = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - - if self.batched_data_model is None: - self.nr_tr_ll_prev_batched = None - self.nr_tr_pred_gain_batched = None - else: - self.nr_tr_ll_prev_batched = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - self.nr_tr_pred_gain_batched = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - - n_obs = tf.cast(self.full_data_model.num_observations, dtype=dtype) - - nr_tr_proposed_vector_full = self.trust_region_newton_update( - update_raw=nr_update_full_raw, - radius_container=self.nr_tr_radius, - n_obs=self.num_observations_tf - ) - nr_tr_pred_cost_gain_full = self.trust_region_newton_cost_gain( - proposed_vector=nr_tr_proposed_vector_full, - neg_jac=self.full_data_model.neg_jac_train, - hessian_fim=self.full_data_model.neg_hessians_train, - n_obs=self.num_observations_tf - ) - - if self.batched_data_model is not None: - nr_tr_proposed_vector_batched = self.trust_region_newton_update( - update_raw=nr_update_batched_raw, - radius_container=self.nr_tr_radius, - n_obs=self.batch_size_tf - ) - nr_tr_pred_cost_gain_batched = self.trust_region_newton_cost_gain( - proposed_vector=nr_tr_proposed_vector_full, - neg_jac=self.batched_data_model.neg_jac_train, - hessian_fim=self.batched_data_model.neg_hessians_train, - n_obs=self.batch_size_tf - ) - else: - nr_tr_pred_cost_gain_batched = None - 
nr_tr_proposed_vector_batched = None - - nr_tr_proposed_vector_full_pad, nr_tr_proposed_vector_batched_pad = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - update_full_raw=nr_tr_proposed_vector_full, - update_batched_raw=nr_tr_proposed_vector_batched - ) - - train_ops_nr_tr_full = self.trust_region_ops( - likelihood_container=self.nr_tr_ll_prev_full, - proposed_vector=nr_tr_proposed_vector_full_pad, - proposed_vector_container=self.nr_tr_x_step_full, - proposed_gain=nr_tr_pred_cost_gain_full, - proposed_gain_container=self.nr_tr_pred_gain_full, - radius_container=self.nr_tr_radius, - dtype=dtype - ) - if self.batched_data_model is not None: - train_ops_nr_tr_batched = self.trust_region_ops( - likelihood_container=self.nr_tr_ll_prev_batched, - proposed_vector=nr_tr_proposed_vector_batched_pad, - proposed_vector_container=self.nr_tr_x_step_batched, - proposed_gain=nr_tr_pred_cost_gain_batched, - proposed_gain_container=self.nr_tr_pred_gain_batched, - radius_container=self.nr_tr_radius, - dtype=dtype - ) - else: - train_ops_nr_tr_batched = None - else: - train_ops_nr_tr_full = None - train_ops_nr_tr_batched = None - self.nr_tr_radius = tf.Variable(np.array([np.inf]), dtype=dtype) - - if provide_optimizers["irls"] or provide_optimizers["irls_tr"] or \ - provide_optimizers["irls_gd"] or provide_optimizers["irls_gd_tr"]: - # Compute a and b model updates separately. - if train_mu: - # The FIM of the mean model is guaranteed to be - # positive semi-definite and can therefore be inverted - # with the Cholesky decomposition. This information is - # passed here with psd=True. - if self.batched_data_model is None: - batched_lhs = None - batched_rhs = None - else: - batched_lhs = self.batched_data_model.fim_a - batched_rhs = self.batched_data_model.neg_jac_a - - irls_update_a_full, irls_update_a_batched = self.build_updates_nr( - full_lhs=self.full_data_model.fim_a, - batched_lhs=batched_lhs, - full_rhs=self.full_data_model.neg_jac_a, - batched_rhs=batched_rhs, - psd=True - ) - else: - irls_update_a_full = None - irls_update_a_batched = None - - if train_r: - if self.batched_data_model is None: - batched_lhs = None - batched_rhs = None - else: - batched_lhs = self.batched_data_model.fim_b - batched_rhs = self.batched_data_model.neg_jac_b - if provide_optimizers["irls"] or provide_optimizers["irls_tr"]: - irls_update_b_full, irls_update_b_batched = self.build_updates_nr( - full_lhs=self.full_data_model.fim_b, - batched_lhs=batched_lhs, - full_rhs=self.full_data_model.neg_jac_b, - batched_rhs=batched_rhs, - psd=False - ) - else: - irls_update_b_full = None - irls_update_b_batched = None - if provide_optimizers["irls_gd"] or provide_optimizers["irls_gd_tr"]: - if self.batched_data_model is not None: - batched_jac = self.batched_data_model.neg_jac_b - else: - batched_jac = None - irls_gd_update_b_full, irls_gd_update_b_batched = self.build_updates_gd( - full_jac=self.full_data_model.neg_jac_b, - batched_jac=batched_jac, - ) - else: - irls_gd_update_b_full = None - irls_gd_update_b_batched = None - else: - irls_update_b_full = None - irls_update_b_batched = None - irls_gd_update_b_full = None - irls_gd_update_b_batched = None - - if provide_optimizers["irls"]: - if train_mu and train_r: - irls_update_full_raw = tf.concat([irls_update_a_full, irls_update_b_full], axis=0) - if self.batched_data_model is not None: - irls_update_batched_raw = tf.concat([irls_update_a_batched, irls_update_b_batched], axis=0) - else: - irls_update_batched_raw = None - elif train_mu: - irls_update_full_raw = 
irls_update_a_full - if self.batched_data_model is not None: - irls_update_batched_raw = irls_update_a_batched - else: - irls_update_batched_raw = None - elif train_r: - irls_update_full_raw = irls_update_b_full - if self.batched_data_model is not None: - irls_update_batched_raw = irls_update_b_batched - else: - irls_update_batched_raw = None - else: - irls_update_full_raw = None - if self.batched_data_model is not None: - irls_update_batched_raw = None - else: - irls_update_batched_raw = None - - irls_update_full, irls_update_batched = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - update_full_raw=irls_update_full_raw, - update_batched_raw=irls_update_batched_raw - ) - - self.irls_tr_x_step_full = tf.Variable(tf.zeros_like(irls_update_full)) - if self.batched_data_model is None: - self.irls_tr_x_step_batched = None - else: - self.irls_tr_x_step_batched = tf.Variable(tf.zeros_like(irls_update_full)) - else: - irls_update_full = None - irls_update_batched = None - - if provide_optimizers["irls_gd"]: - if train_mu and train_r: - irls_gd_update_full_raw = tf.concat([irls_update_a_full, irls_gd_update_b_full], axis=0) - if self.batched_data_model is not None: - irls_gd_update_batched_raw = tf.concat([irls_update_a_batched, irls_gd_update_b_batched], axis=0) - else: - irls_gd_update_batched_raw = None - elif train_mu: - irls_gd_update_full_raw = irls_update_a_full - if self.batched_data_model is not None: - irls_gd_update_batched_raw = irls_update_a_batched - else: - irls_gd_update_batched_raw = None - elif train_r: - irls_gd_update_full_raw = irls_gd_update_b_full - if self.batched_data_model is not None: - irls_gd_update_batched_raw = irls_gd_update_b_batched - else: - irls_gd_update_batched_raw = None - else: - irls_gd_update_full_raw = None - if self.batched_data_model is not None: - irls_gd_update_batched_raw = None - else: - irls_gd_update_batched_raw = None - - irls_gd_update_full, irls_gd_update_batched = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - update_full_raw=irls_gd_update_full_raw, - update_batched_raw=irls_gd_update_batched_raw - ) - - self.irls_gd_tr_x_step_full = tf.Variable(tf.zeros_like(irls_gd_update_full)) - if self.batched_data_model is None: - self.irls_gd_tr_x_step_batched = None - else: - self.irls_gd_tr_x_step_batched = tf.Variable(tf.zeros_like(irls_gd_update_batched)) - else: - irls_gd_update_full = None - irls_gd_update_batched = None - - if provide_optimizers["irls_tr"] or provide_optimizers["irls_gd_tr"]: - self.irls_tr_radius = tf.Variable( - np.zeros(shape=[self.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, - dtype=dtype - ) - self.irls_tr_ll_prev_full = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - self.irls_tr_pred_gain_full = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - - if self.batched_data_model is None: - self.irls_tr_ll_prev_batched = None - self.irls_tr_pred_gain_batched = None - else: - self.irls_tr_ll_prev_batched = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - self.irls_tr_pred_gain_batched = tf.Variable(np.zeros(shape=[self.model_vars.n_features])) - - if train_mu: - irls_tr_proposed_vector_full_a = self.trust_region_newton_update( - update_raw=irls_update_a_full, - radius_container=self.irls_tr_radius, - n_obs=self.num_observations_tf - ) - irls_tr_pred_cost_gain_full_a = self.trust_region_newton_cost_gain( - proposed_vector=irls_tr_proposed_vector_full_a, - neg_jac=self.full_data_model.neg_jac_a, - hessian_fim=self.full_data_model.fim_a, - 
n_obs=self.num_observations_tf - ) - else: - irls_tr_proposed_vector_full_a = None - irls_tr_pred_cost_gain_full_a = None - - if train_r: - if provide_optimizers["irls_tr"]: - irls_tr_proposed_vector_full_b = self.trust_region_newton_update( - update_raw=irls_update_b_full, - radius_container=self.irls_tr_radius, - n_obs=self.num_observations_tf - ) - irls_tr_pred_cost_gain_full_b = self.trust_region_newton_cost_gain( - proposed_vector=irls_tr_proposed_vector_full_b, - neg_jac=self.full_data_model.neg_jac_b, - hessian_fim=self.full_data_model.fim_b, - n_obs=self.num_observations_tf - ) - else: - irls_tr_proposed_vector_full_b = None - irls_tr_pred_cost_gain_full_b = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_proposed_vector_full_b = self.trust_region_linear_update( - update_raw=irls_gd_update_b_full, - radius_container=self.irls_tr_radius, - n_obs=self.num_observations_tf - ) - irls_gd_tr_pred_cost_gain_full_b = self.trust_region_linear_cost_gain( - proposed_vector=irls_gd_tr_proposed_vector_full_b, - neg_jac=self.full_data_model.neg_jac_b, - n_obs=self.num_observations_tf - ) - else: - irls_gd_tr_proposed_vector_full_b = None - irls_gd_tr_pred_cost_gain_full_b = None - else: - irls_tr_proposed_vector_full_b = None - irls_tr_pred_cost_gain_full_b = None - irls_gd_tr_proposed_vector_full_b = None - irls_gd_tr_pred_cost_gain_full_b = None - - if self.batched_data_model is not None: - if train_mu: - irls_tr_proposed_vector_batched_a = self.trust_region_newton_update( - update_raw=irls_update_a_batched, - radius_container=self.irls_tr_radius, - n_obs=self.batch_size_tf - ) - irls_tr_pred_cost_gain_batched_a = self.trust_region_newton_cost_gain( - proposed_vector=irls_tr_proposed_vector_batched_a, - neg_jac=self.batched_data_model.neg_jac_a, - hessian_fim=self.batched_data_model.fim_a, - n_obs=self.batch_size_tf - ) - else: - irls_tr_proposed_vector_batched_a = None - irls_tr_pred_cost_gain_batched_a = None - - if train_r: - if provide_optimizers["irls_tr"]: - irls_tr_proposed_vector_batched_b = self.trust_region_newton_update( - update_raw=irls_update_b_batched, - radius_container=self.irls_tr_radius, - n_obs=self.batch_size_tf - ) - irls_tr_pred_cost_gain_batched_b = self.trust_region_newton_cost_gain( - proposed_vector=irls_tr_proposed_vector_batched_b, - neg_jac=self.batched_data_model.neg_jac_b, - hessian_fim=self.batched_data_model.fim_b, - n_obs=self.batch_size_tf - ) - else: - irls_tr_proposed_vector_batched_b = None - irls_tr_pred_cost_gain_batched_b = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_proposed_vector_batched_b = self.trust_region_linear_update( - update_raw=irls_gd_update_b_batched, - radius_container=self.irls_tr_radius, - n_obs=self.batch_size_tf - ) - irls_gd_tr_pred_cost_gain_batched_b = self.trust_region_linear_cost_gain( - proposed_vector=irls_gd_tr_proposed_vector_batched_b, - neg_jac=self.batched_data_model.neg_jac_b, - n_obs=self.batch_size_tf - ) - else: - irls_gd_tr_proposed_vector_batched_b = None - irls_gd_tr_pred_cost_gain_batched_b = None - else: - irls_tr_proposed_vector_batched_b = None - irls_tr_pred_cost_gain_batched_b = None - irls_gd_tr_proposed_vector_batched_b = None - irls_gd_tr_pred_cost_gain_batched_b = None - - if train_mu and train_r: - if provide_optimizers["irls_tr"]: - irls_tr_update_full_raw = tf.concat([irls_tr_proposed_vector_full_a, - irls_tr_proposed_vector_full_b], axis=0) - irls_tr_pred_cost_gain_full = tf.add(irls_tr_pred_cost_gain_full_a, - irls_tr_pred_cost_gain_full_b) - else: - irls_tr_update_full_raw 
= None - irls_tr_pred_cost_gain_full = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_update_full_raw = tf.concat([irls_tr_proposed_vector_full_a, - irls_gd_tr_proposed_vector_full_b], axis=0) - irls_gd_tr_pred_cost_gain_full = tf.add(irls_tr_pred_cost_gain_full_a, - irls_gd_tr_pred_cost_gain_full_b) - else: - irls_gd_tr_update_full_raw = None - irls_gd_tr_pred_cost_gain_full = None - - if self.batched_data_model is not None: - if provide_optimizers["irls_tr"]: - irls_tr_update_batched_raw = tf.concat([irls_tr_proposed_vector_batched_a, - irls_tr_proposed_vector_batched_b], axis=0) - irls_tr_pred_cost_gain_batched = tf.add(irls_tr_pred_cost_gain_batched_a, - irls_tr_pred_cost_gain_batched_b) - else: - irls_tr_update_batched_raw = None - irls_tr_pred_cost_gain_batched = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_update_batched_raw = tf.concat([irls_tr_proposed_vector_batched_a, - irls_gd_tr_proposed_vector_batched_b], axis=0) - irls_gd_tr_pred_cost_gain_batched = tf.add(irls_tr_pred_cost_gain_batched_a, - irls_gd_tr_pred_cost_gain_batched_b) - else: - irls_gd_tr_update_batched_raw = None - irls_gd_tr_pred_cost_gain_batched = None - else: - irls_tr_update_batched_raw = None - irls_gd_tr_update_batched_raw = None - irls_tr_pred_cost_gain_batched = None - irls_gd_tr_pred_cost_gain_batched = None - elif train_mu and not train_r: - irls_tr_update_full_raw = irls_tr_proposed_vector_full_a - irls_gd_tr_update_full_raw = irls_tr_proposed_vector_full_a - irls_tr_pred_cost_gain_full = irls_tr_pred_cost_gain_full_a - irls_gd_tr_pred_cost_gain_full = irls_tr_pred_cost_gain_full_a - if self.batched_data_model is not None: - irls_tr_update_batched_raw = irls_tr_proposed_vector_batched_a - irls_gd_tr_update_batched_raw = irls_tr_proposed_vector_batched_a - irls_tr_pred_cost_gain_batched = irls_tr_pred_cost_gain_batched_a - irls_gd_tr_pred_cost_gain_batched = irls_tr_pred_cost_gain_batched_a - else: - irls_tr_update_batched_raw = None - irls_gd_tr_update_batched_raw = None - irls_tr_pred_cost_gain_batched = None - irls_gd_tr_pred_cost_gain_batched = None - elif not train_mu and train_r: - if provide_optimizers["irls_tr"]: - irls_tr_update_full_raw = irls_tr_proposed_vector_full_b - irls_tr_pred_cost_gain_full = irls_tr_pred_cost_gain_full_b - else: - irls_tr_update_full_raw = None - irls_tr_pred_cost_gain_full = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_update_full_raw = irls_gd_tr_proposed_vector_full_b - irls_gd_tr_pred_cost_gain_full = irls_gd_tr_pred_cost_gain_full_b - else: - irls_gd_tr_update_full_raw = None - irls_gd_tr_pred_cost_gain_full = None - - if self.batched_data_model is not None: - if provide_optimizers["irls_tr"]: - irls_tr_update_batched_raw = irls_tr_proposed_vector_batched_b - irls_tr_pred_cost_gain_batched = irls_tr_pred_cost_gain_batched_b - else: - irls_tr_update_batched_raw = None - irls_tr_pred_cost_gain_batched = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_update_batched_raw = irls_gd_tr_proposed_vector_batched_b - irls_gd_tr_pred_cost_gain_batched = irls_gd_tr_pred_cost_gain_batched_b - else: - irls_gd_tr_update_batched_raw = None - irls_gd_tr_pred_cost_gain_batched = None - else: - irls_tr_update_batched_raw = None - irls_gd_tr_update_batched_raw = None - irls_tr_pred_cost_gain_batched = None - irls_gd_tr_pred_cost_gain_batched = None - else: - assert False - - if provide_optimizers["irls_tr"]: - irls_tr_update_full, irls_tr_update_batched = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - 
update_full_raw=irls_tr_update_full_raw, - update_batched_raw=irls_tr_update_batched_raw - ) - else: - irls_tr_update_full = None - irls_tr_update_batched = None - - if provide_optimizers["irls_gd_tr"]: - irls_gd_tr_update_full, irls_gd_tr_update_batched = self.pad_updates( - train_mu=train_mu, - train_r=train_r, - update_full_raw=irls_gd_tr_update_full_raw, - update_batched_raw=irls_gd_tr_update_batched_raw - ) - else: - irls_gd_tr_update_full = None - irls_gd_tr_update_batched = None - - if provide_optimizers["irls_tr"] or provide_optimizers["irls_gd_tr"]: - self.irls_tr_x_step_full = tf.Variable(tf.zeros_like(self.model_vars.params)) - if self.batched_data_model is None: - self.irls_tr_x_step_batched = None - else: - self.irls_tr_x_step_batched = tf.Variable(tf.zeros_like(self.model_vars.params)) - else: - self.irls_tr_x_step_full = None - self.irls_tr_x_step_batched = None - - if provide_optimizers["irls_tr"]: - train_ops_irls_tr_full = self.trust_region_ops( - likelihood_container=self.irls_tr_ll_prev_full, - proposed_vector=irls_tr_update_full, - proposed_vector_container=self.irls_tr_x_step_full, - proposed_gain=irls_tr_pred_cost_gain_full, - proposed_gain_container=self.irls_tr_pred_gain_full, - radius_container=self.irls_tr_radius, - dtype=dtype - ) - if self.batched_data_model is not None: - train_ops_irls_tr_batched = self.trust_region_ops( - likelihood_container=self.irls_tr_ll_prev_batched, - proposed_vector=irls_tr_update_batched, - proposed_vector_container=self.irls_tr_x_step_batched, - proposed_gain=irls_tr_pred_cost_gain_batched, - proposed_gain_container=self.irls_tr_pred_gain_batched, - radius_container=self.irls_tr_radius, - dtype=dtype - ) - else: - train_ops_irls_tr_batched = None - else: - train_ops_irls_tr_full = None - train_ops_irls_tr_batched = None - - if provide_optimizers["irls_gd_tr"]: - train_ops_irls_gd_tr_full = self.trust_region_ops( - likelihood_container=self.irls_tr_ll_prev_full, - proposed_vector=irls_gd_tr_update_full, - proposed_vector_container=self.irls_tr_x_step_full, - proposed_gain=irls_gd_tr_pred_cost_gain_full, - proposed_gain_container=self.irls_tr_pred_gain_full, - radius_container=self.irls_tr_radius, - dtype=dtype - ) - if self.batched_data_model is not None: - train_ops_irls_gd_tr_batched = self.trust_region_ops( - likelihood_container=self.irls_tr_ll_prev_batched, - proposed_vector=irls_gd_tr_update_batched, - proposed_vector_container=self.irls_tr_x_step_batched, - proposed_gain=irls_gd_tr_pred_cost_gain_batched, - proposed_gain_container=self.irls_tr_pred_gain_batched, - radius_container=self.irls_tr_radius, - dtype=dtype - ) - else: - train_ops_irls_gd_tr_batched = None - else: - self.irls_gd_tr_x_step_full = None - self.irls_gd_tr_x_step_batched = None - train_ops_irls_gd_tr_full = None - train_ops_irls_gd_tr_batched = None - else: - train_ops_irls_tr_full = None - train_ops_irls_tr_batched = None - train_ops_irls_gd_tr_full = None - train_ops_irls_gd_tr_batched = None - self.irls_tr_radius = tf.Variable(np.array([np.inf]), dtype=dtype) - else: - nr_update_full = None - nr_update_batched = None - train_ops_nr_tr_full = None - train_ops_nr_tr_batched = None - - irls_update_full = None - irls_update_batched = None - irls_gd_update_full = None - irls_gd_update_batched = None - train_ops_irls_tr_full = None - train_ops_irls_tr_batched = None - train_ops_irls_gd_tr_full = None - train_ops_irls_gd_tr_batched = None - - self.nr_tr_radius = tf.Variable(np.array([np.inf]), dtype=dtype) - self.irls_tr_radius = tf.Variable(np.array([np.inf]), 
dtype=dtype) - - self.nr_update_full = nr_update_full - self.nr_update_batched = nr_update_batched - self.train_ops_nr_tr_full = train_ops_nr_tr_full - self.train_ops_nr_tr_batched = train_ops_nr_tr_batched - - self.irls_update_full = irls_update_full - self.irls_update_batched = irls_update_batched - self.irls_gd_update_full = irls_gd_update_full - self.irls_gd_update_batched = irls_gd_update_batched - self.train_ops_irls_tr_full = train_ops_irls_tr_full - self.train_ops_irls_tr_batched = train_ops_irls_tr_batched - self.train_ops_irls_gd_tr_full = train_ops_irls_gd_tr_full - self.train_ops_irls_gd_tr_batched = train_ops_irls_gd_tr_batched - - def build_updates_nr( - self, - full_lhs, - batched_rhs, - full_rhs, - batched_lhs, - psd - ): - update_full = self.newton_type_update( - lhs=full_lhs, - rhs=full_rhs, - psd=psd - ) - if batched_lhs is not None: - update_batched = self.newton_type_update( - lhs=batched_lhs, - rhs=batched_rhs, - psd=psd and pkg_constants.CHOLESKY_LSTSQS_BATCHED # This can be unstable even for fim_a. - ) - else: - update_batched = None - - return update_full, update_batched - - def build_updates_gd( - self, - full_jac, - batched_jac - ): - update_full = tf.transpose(full_jac) - if batched_jac is not None: - update_batched = tf.transpose(batched_jac) - else: - update_batched = None - - return update_full, update_batched - - def pad_updates( - self, - update_full_raw, - update_batched_raw, - train_mu, - train_r - ): - # Pad update vectors to receive update tensors that match - # the shape of model_vars.params. - if train_mu: - if train_r: - netwon_type_update_full = update_full_raw - newton_type_update_batched = update_batched_raw - else: - netwon_type_update_full = tf.concat([ - update_full_raw, - tf.zeros_like(self.model_vars.b_var) - ], axis=0) - if update_batched_raw is not None: - newton_type_update_batched = tf.concat([ - update_batched_raw, - tf.zeros_like(self.model_vars.b_var) - ], axis=0) - else: - newton_type_update_batched = None - elif train_r: - netwon_type_update_full = tf.concat([ - tf.zeros_like(self.model_vars.a_var), - update_full_raw - ], axis=0) - if update_batched_raw is not None: - newton_type_update_batched = tf.concat([ - tf.zeros_like(self.model_vars.a_var), - update_batched_raw - ], axis=0) - else: - newton_type_update_batched = None - else: - raise ValueError("No training necessary") - - return netwon_type_update_full, newton_type_update_batched - - def newton_type_update( - self, - lhs, - rhs, - psd - ): - delta_t = tf.squeeze(tf.linalg.lstsq( - lhs, - tf.expand_dims(rhs, axis=-1), - fast=psd and pkg_constants.CHOLESKY_LSTSQS - ), axis=-1) - update_tensor = tf.transpose(delta_t) - - return update_tensor - - def trust_region_newton_update( - self, - update_raw, - radius_container, - n_obs - ): - update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) - update_magnitude = tf.where( - condition=update_magnitude_sq > 0, - x=tf.sqrt(update_magnitude_sq), - y=tf.zeros_like(update_magnitude_sq) - ) - update_magnitude_inv = tf.where( - condition=update_magnitude > 0, - x=tf.divide( - tf.ones_like(update_magnitude), - update_magnitude - ), - y=tf.zeros_like(update_magnitude) - ) - update_norm = tf.multiply(update_raw,update_magnitude_inv) - update_scale = tf.minimum( - radius_container, - update_magnitude - ) - proposed_vector = tf.multiply( - update_norm, - update_scale - ) - - return proposed_vector - - def trust_region_linear_update( - self, - update_raw, - radius_container, - n_obs - ): - update_magnitude_sq = 
tf.reduce_sum(tf.square(update_raw), axis=0) - update_magnitude = tf.where( - condition=update_magnitude_sq > 0, - x=tf.sqrt(update_magnitude_sq), - y=tf.zeros_like(update_magnitude_sq) - ) - update_magnitude_inv = tf.where( - condition=update_magnitude > 0, - x=tf.divide( - tf.ones_like(update_magnitude), - update_magnitude - ), - y=tf.zeros_like(update_magnitude) - ) - update_norm = tf.multiply(update_raw,update_magnitude_inv) - update_scale = tf.minimum( - radius_container, - update_magnitude / n_obs # learning rate = 1 - ) - proposed_vector = tf.multiply( - update_norm, - update_scale - ) - - return proposed_vector - - def trust_region_newton_cost_gain( - self, - proposed_vector, - neg_jac, - hessian_fim, - n_obs - ): - pred_cost_gain = tf.add( - tf.einsum( - 'ni,in->n', - neg_jac, - proposed_vector - ) / n_obs, - 0.5 * tf.einsum( - 'nix,xin->n', - tf.einsum('inx,nij->njx', - tf.expand_dims(proposed_vector, axis=-1), - hessian_fim), - tf.expand_dims(proposed_vector, axis=0) - ) / tf.square(n_obs) - ) - return pred_cost_gain - - def trust_region_linear_cost_gain( - self, - proposed_vector, - neg_jac, - n_obs - ): - pred_cost_gain = tf.reduce_sum(tf.multiply( - proposed_vector, - tf.transpose(neg_jac) - ), axis=0) - return pred_cost_gain - - def trust_region_ops( - self, - likelihood_container, - proposed_vector, - proposed_vector_container, - proposed_gain, - proposed_gain_container, - radius_container, - dtype - ): - # Load hyper-parameters: - assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ - "eta0 must be smaller than eta1" - assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ - "eta1 must be smaller than or equal to eta2" - assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" - assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" - # Set trust region hyper-parameters - eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=dtype) - eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=dtype) - eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=dtype) - t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=dtype) - t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=dtype) - upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=dtype) - - # Phase I: Perform a trial update. - # Propose parameter update: - train_op_nr_tr_prev = tf.group( - tf.compat.v1.assign(likelihood_container, self.full_data_model.norm_neg_log_likelihood_eval1) - ) - train_op_x_step = tf.group( - tf.compat.v1.assign(proposed_vector_container, proposed_vector), - tf.compat.v1.assign(proposed_gain_container, proposed_gain) - ) - train_op_trial_update = tf.group( - tf.compat.v1.assign(self.model_vars.params, self.model_vars.params - proposed_vector) - ) - - # Phase II: Evaluate success of trial update and complete update cycle. - # Include parameter updates only if update improves cost function: - delta_f_actual = likelihood_container - self.full_data_model.norm_neg_log_likelihood_eval0 - delta_f_ratio = tf.divide(delta_f_actual, proposed_gain_container) - - # Compute parameter updates. 
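Aside on the removed trust-region machinery above: `newton_type_update` solves the per-feature linear system with `tf.linalg.lstsq`, `trust_region_newton_update` rescales that step so it never leaves the current trust region, and `trust_region_ops` accepts or rejects the trial step and adapts the radius from the ratio of actual to predicted gain. A minimal NumPy sketch of that cycle for a single feature, with illustrative hyper-parameter values standing in for the `pkg_constants` settings:

```python
import numpy as np

def newton_type_update(hessian, grad):
    """Solve H @ delta = grad in the least-squares sense (cf. tf.linalg.lstsq)."""
    return np.linalg.lstsq(hessian, grad, rcond=None)[0]

def clip_to_radius(step, radius):
    """Rescale the proposed step so its norm does not exceed the trust-region radius."""
    norm = np.linalg.norm(step)
    return step if norm <= radius or norm == 0.0 else step * (radius / norm)

def update_radius(radius, actual_gain, predicted_gain,
                  eta0=0.0, eta1=0.25, eta2=0.75, t1=0.5, t2=2.0, upper=1e3):
    """Shrink/keep/grow the radius from the ratio of actual to predicted improvement."""
    ratio = actual_gain / predicted_gain
    if actual_gain <= eta0 or ratio <= eta1:   # poor step: shrink
        return min(radius * t1, upper)
    if ratio > eta2:                           # very good step: grow
        return min(radius * t2, upper)
    return min(radius, upper)                  # acceptable step: keep

# Tiny example on a quadratic objective with known Hessian and gradient:
H = np.array([[2.0, 0.0], [0.0, 4.0]])
g = np.array([1.0, 2.0])
step = clip_to_radius(newton_type_update(H, g), radius=0.3)
print(step, np.linalg.norm(step))                                 # norm is capped at 0.3
print(update_radius(0.3, actual_gain=0.05, predicted_gain=0.04))  # 0.6 (radius grows)
```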
- update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model_vars.converged)) - update_theta_numeric = tf.expand_dims(tf.cast(update_theta, dtype), axis=0) - keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric - theta_new_nr_tr = tf.add( - tf.multiply(self.model_vars.params + proposed_vector_container, keep_theta_numeric), # old values - tf.multiply(self.model_vars.params, update_theta_numeric) # new values - ) - - train_op_update_params = tf.compat.v1.assign(self.model_vars.params, theta_new_nr_tr) - train_op_update_status = tf.compat.v1.assign(self.model_vars.updated, update_theta) - - # Update trusted region accordingly: - decrease_radius = tf.logical_or( - delta_f_actual <= eta0, - tf.logical_and(delta_f_ratio <= eta1, tf.logical_not(self.model_vars.converged)) - ) - increase_radius = tf.logical_and( - delta_f_actual > eta0, - tf.logical_and(delta_f_ratio > eta2, tf.logical_not(self.model_vars.converged)) - ) - keep_radius = tf.logical_and(tf.logical_not(decrease_radius), - tf.logical_not(increase_radius)) - radius_update = tf.add_n([ - tf.multiply(t1, tf.cast(decrease_radius, dtype)), - tf.multiply(t2, tf.cast(increase_radius, dtype)), - tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, dtype)) - ]) - radius_new = tf.minimum(tf.multiply(radius_container, radius_update), upper_bound) - train_op_update_radius = tf.compat.v1.assign(radius_container, radius_new) - - train_ops = { - "update": proposed_vector_container, - "trial_op": tf.group( - train_op_nr_tr_prev, - train_op_x_step, - train_op_trial_update - ), - "update_op": tf.group( - train_op_update_params, - train_op_update_status, - train_op_update_radius - ) - } - - return train_ops - - -class TrainerGraphGLM: - """ - - """ - model_vars: ModelVarsGLM - model_vars_eval: ModelVarsGLM - - full_data_model: FullDataModelGraphGLM - batched_data_model: BatchedDataModelGraphGLM - - gradient_graph: GradientGraphGLM - gradients_batch: tf.Tensor - gradients_full: tf.Tensor - - nr_update_full: tf.Tensor - nr_update_batched: tf.Tensor - nr_tr_update_full: tf.Tensor - nr_tr_update_batched: tf.Tensor - irls_update_full: tf.Tensor - irls_update_batched: tf.Tensor - irls_tr_update_full: tf.Tensor - irls_tr_update_batched: tf.Tensor - - nr_tr_radius: Union[tf.Variable, None] - nr_tr_pred_cost_gain_full: Union[tf.Tensor, None] - nr_tr_pred_cost_gain_batched: Union[tf.Tensor, None] - - irls_tr_radius: Union[tf.Variable, None] - irls_tr_pred_cost_gain_full: Union[tf.Tensor, None] - irls_tr_pred_cost_gain_batched: Union[tf.Tensor, None] - - num_observations: int - num_features: int - num_design_loc_params: int - num_design_scale_params: int - num_loc_params: int - num_scale_params: int - batch_size: int - - session: tf.compat.v1.Session - graph: tf.Graph - - def __init__( - self, - provide_optimizers, - train_loc, - train_scale, - dtype - ): - with tf.name_scope("training_graphs"): - global_step = tf.compat.v1.train.get_or_create_global_step() - - if (train_loc or train_scale) and self.batched_data_model is not None: - logger.debug(" ** building batched trainers") - trainer_batch = train_utils.MultiTrainer( - variables=self.model_vars.params, - gradients=self.gradients_batch, - newton_delta=self.nr_update_batched, - irls_delta=self.irls_update_batched, - irls_gd_delta=self.irls_gd_update_batched, - train_ops_nr_tr=self.train_ops_nr_tr_batched, - train_ops_irls_tr=self.train_ops_irls_tr_batched, - train_ops_irls_gd_tr=self.train_ops_irls_gd_tr_batched, - learning_rate=self.learning_rate, - 
global_step=global_step, - apply_gradients=lambda grad: tf.where(tf.math.is_nan(grad), tf.zeros_like(grad), grad), - provide_optimizers=provide_optimizers, - name="batch_data_trainers" - ) - batch_gradient = trainer_batch.plain_gradient_by_variable(self.model_vars.params) - batch_gradient = tf.reduce_sum(tf.abs(batch_gradient), axis=0) - else: - trainer_batch = None - batch_gradient = None - - if train_loc or train_scale: - logger.debug(" ** building full trainers") - trainer_full = train_utils.MultiTrainer( - variables=self.model_vars.params, - gradients=self.gradients_full, - newton_delta=self.nr_update_full, - irls_delta=self.irls_update_full, - irls_gd_delta=self.irls_gd_update_full, - train_ops_nr_tr=self.train_ops_nr_tr_full, - train_ops_irls_tr=self.train_ops_irls_tr_full, - train_ops_irls_gd_tr=self.train_ops_irls_gd_tr_full, - learning_rate=self.learning_rate, - global_step=global_step, - apply_gradients=lambda grad: tf.where(tf.math.is_nan(grad), tf.zeros_like(grad), grad), - provide_optimizers=provide_optimizers, - name="full_data_trainers" - ) - full_gradient = trainer_full.plain_gradient_by_variable(self.model_vars.params) - full_gradient = tf.reduce_sum(tf.abs(full_gradient), axis=0) - else: - trainer_full = None - full_gradient = None - - # # ### BFGS implementation using SciPy L-BFGS - # with tf1.name_scope("bfgs"): - # feature_idx = tf1.placeholder(dtype="int64", shape=()) - # - # X_s = tf1.gather(X, feature_idx, axis=1) - # a_s = tf1.gather(a, feature_idx, axis=1) - # b_s = tf1.gather(b, feature_idx, axis=1) - # - # model = BasicModelGraph(X_s, design_loc, design_scale, a_s, b_s, size_factors=size_factors) - # - # trainer = tf1.contrib.opt.ScipyOptimizerInterface( - # model.loss, - # method='L-BFGS-B', - # options={'maxiter': maxiter}) - - self.global_step = global_step - - self.trainer_batch = trainer_batch - self.gradient = batch_gradient - - self.trainer_full = trainer_full - self.full_gradient = full_gradient - - self.train_op = None - - @abc.abstractmethod - def param_bounds(self): - pass - - -class EstimatorGraphGLM(TFEstimatorGraph, NewtonGraphGLM, TrainerGraphGLM): - """ - The estimator graphs are all graph necessary to perform parameter updates and to - summarise a current parameter estimate. - - The estimator graph class is divided into the following major sub graphs: - - - The input pipeline: Feed data for parameter updates. - - - """ - X: Union[tf.Tensor, tf.SparseTensor] - - a_var: tf.Tensor - b_var: tf.Tensor - - model_vars: ModelVarsGLM - model_vars_eval: ModelVarsGLM - - noise_model: str - - def __init__( - self, - num_observations: int, - num_features: int, - num_design_loc_params: int, - num_design_scale_params: int, - num_loc_params: int, - num_scale_params: int, - graph: tf.Graph, - batch_size: int, - constraints_loc: np.ndarray, - constraints_scale: np.ndarray, - dtype: str - ): - """ - - :param num_observations: int - Number of observations. - :param num_features: int - Number of features. - :param num_design_loc_params: int - Number of parameters per feature in mean model. - :param num_design_scale_params: int - Number of parameters per feature in scale model. - :param graph: tf1.Graph - :param constraints_loc: tensor (all parameters x dependent parameters) or None - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the mean model. - This form of constraints is used in vector generalized linear models (VGLMs). 
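Side note on the constraint tensors described above: they map the independent coefficients that are actually optimized onto the complete coefficient set (the removed `ModelVarsGLM` further down applies them as `tf.matmul(constraints_loc, a_var)`). A small NumPy sketch, assuming a sum-to-zero constraint as an example encoding:

```python
import numpy as np

# Assumed example: three coefficients whose effects must sum to zero, so the last
# one is minus the sum of the two independent coefficients.
constraints_loc = np.array([
    [ 1.0,  0.0],
    [ 0.0,  1.0],
    [-1.0, -1.0],
])

a_var = np.array([[0.5,  1.0],   # independent coefficients (2 coefficients x 2 features)
                  [2.0, -1.0]])
a = constraints_loc @ a_var      # complete coefficient set  (3 coefficients x 2 features)
print(a)                         # last row equals -(row0 + row1) for every feature
```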
- Assumed to be an identity matrix if None. - :param constraints_scale: tensor (all parameters x dependent parameters) or None - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the dispersion model. - This form of constraints is used in vector generalized linear models (VGLMs). - Assumed to be an identity matrix if None. - """ - TFEstimatorGraph.__init__( - self=self, - graph=graph - ) - - self.num_observations = num_observations - self.num_observations_tf = tf.cast(num_observations, dtype=dtype) - self.num_features = num_features - self.num_design_loc_params = num_design_loc_params - self.num_design_scale_params = num_design_scale_params - self.num_loc_params = num_loc_params - self.num_scale_params = num_scale_params - self.batch_size = batch_size - self.batch_size_tf = tf.cast(batch_size, dtype=dtype) - - self.constraints_loc = self._set_constraints( - constraints=constraints_loc, - num_design_params=self.num_design_loc_params, - dtype=dtype - ) - self.constraints_scale = self._set_constraints( - constraints=constraints_scale, - num_design_params=self.num_design_scale_params, - dtype=dtype - ) - - self.learning_rate = tf.compat.v1.placeholder(dtype, shape=(), name="learning_rate") - - def _run_trainer_init( - self, - provide_optimizers, - train_loc, - train_scale, - dtype - ): - logger.debug(" * building gradient graph") - self.gradient_graph = GradientGraphGLM( - model_vars=self.model_vars, - full_data_model=self.full_data_model, - batched_data_model=self.batched_data_model, - train_loc=train_loc, - train_scale=train_scale - ) - self.gradients_batch = self.gradient_graph.gradients_batch - self.gradients_full = self.gradient_graph.gradients_full - - logger.debug(" * building newton-type update graph") - NewtonGraphGLM.__init__( - self=self, - provide_optimizers=provide_optimizers, - train_mu=train_loc, - train_r=train_scale, - dtype=dtype - ) - - logger.debug(" * building trainers") - TrainerGraphGLM.__init__( - self=self, - provide_optimizers=provide_optimizers, - train_loc=train_loc, - train_scale=train_scale, - dtype=dtype - ) - - with tf.name_scope("init_op"): - self.init_op = tf.compat.v1.global_variables_initializer() - self.init_ops = [] - - def _set_out_var( - self, - feature_isnonzero, - dtype - ): - # ### output values: - # override all-zero features with lower bound coefficients - with tf.name_scope("output"): - logger.debug(" ** Build training graph: output") - bounds_min, bounds_max = self.param_bounds(dtype) - - param_nonzero_a_var = tf.broadcast_to(feature_isnonzero, [self.num_loc_params, self.num_features]) - alt_a = tf.broadcast_to(bounds_min["a_var"], [self.num_loc_params, self.num_features]) - a_var = tf.where( - param_nonzero_a_var, - self.model_vars.a_var, - alt_a - ) - - param_nonzero_b_var = tf.broadcast_to(feature_isnonzero, [self.num_scale_params, self.num_features]) - alt_b = tf.broadcast_to(bounds_min["b_var"], [self.num_scale_params, self.num_features]) - b_var = tf.where( - param_nonzero_b_var, - self.model_vars.b_var, - alt_b - ) - - self.a_var = a_var - self.b_var = b_var - - def _set_constraints( - self, - constraints, - num_design_params, - dtype - ): - if constraints is None: - return None - #return tf1.eye( - # num_rows=tf1.constant(num_design_params, shape=(), dtype="int32"), - # dtype=dtype - #) - else: - # Check if identity was supplied: - if constraints.shape[0] == constraints.shape[1]: - if np.sum(constraints - 
np.eye(constraints.shape[0], dtype=constraints.dtype)) < 1e-12: - return None - - assert constraints.shape[0] == num_design_params, "constraint dimension mismatch" - return tf.cast(constraints, dtype=dtype) - - @abc.abstractmethod - def param_bounds(self): - pass diff --git a/batchglm/train/tf1/base_glm/external.py b/batchglm/train/tf1/base_glm/external.py deleted file mode 100644 index aea90c59..00000000 --- a/batchglm/train/tf1/base_glm/external.py +++ /dev/null @@ -1,3 +0,0 @@ -import batchglm.train.tf1.train as train_utils -from batchglm.train.tf1.base import ProcessModelBase, TFEstimatorGraph -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/base_glm/fim.py b/batchglm/train/tf1/base_glm/fim.py deleted file mode 100644 index 6edbb3af..00000000 --- a/batchglm/train/tf1/base_glm/fim.py +++ /dev/null @@ -1,67 +0,0 @@ -import abc -import logging - -logger = logging.getLogger(__name__) - - -class FIMGLM: - """ - Compute expected fisher information matrix (FIM) - for iteratively re-weighted least squares (IWLS or IRLS) parameter updates for GLMs. - """ - - @abc.abstractmethod - def fim_a_analytic( - self, - model - ): - pass - - @abc.abstractmethod - def fim_b_analytic( - self, - model - ): - pass - - @abc.abstractmethod - def _weight_fim_aa( - self, - loc, - scale - ): - """ - Compute for mean model IWLS update for a GLM. - - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return tuple of tf1.tensors - Constants with respect to coefficient index for - Fisher information matrix and score function computation. - """ - pass - - @abc.abstractmethod - def _weight_fim_bb( - self, - loc, - scale - ): - """ - Compute for dispersion model IWLS update for a GLM. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return tuple of tf1.tensors - Constants with respect to coefficient index for - Fisher information matrix and score function computation. - """ - pass \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm/hessians.py b/batchglm/train/tf1/base_glm/hessians.py deleted file mode 100644 index 60d90707..00000000 --- a/batchglm/train/tf1/base_glm/hessians.py +++ /dev/null @@ -1,100 +0,0 @@ -import abc -import logging - -import tensorflow as tf - -logger = logging.getLogger(__name__) - -class HessiansGLM: - """ - Wrapper to compute the Hessian matrix for a GLM. - """ - - def hessian_analytic( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def hessian_tf( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - @abc.abstractmethod - def _weight_hessian_aa( - self, - X, - loc, - scale - ): - """ - Compute the coefficient index invariant part of the - mean model block of the hessian. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return const: tf1.tensor observations x features - Coefficient invariant terms of hessian of - given observations and features. 
- """ - pass - - @abc.abstractmethod - def _weight_hessian_bb( - self, - X, - loc, - scale - ): - """ - Compute the coefficient index invariant part of the - dispersion model block of the hessian. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return const: tf1.tensor observations x features - Coefficient invariant terms of hessian of - given observations and features. - """ - pass - - @abc.abstractmethod - def _weight_hessian_ab( - self, - X, - loc, - scale - ): - """ - Compute the coefficient index invariant part of the - mean-dispersion model block of the hessian. - - Note that there are two blocks of the same size which can - be compute from each other with a transpose operation as - the hessian is symmetric. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return const: tf1.tensor observations x features - Coefficient invariant terms of hessian of - given observations and features. - """ - pass - diff --git a/batchglm/train/tf1/base_glm/jacobians.py b/batchglm/train/tf1/base_glm/jacobians.py deleted file mode 100644 index 1eeab1d7..00000000 --- a/batchglm/train/tf1/base_glm/jacobians.py +++ /dev/null @@ -1,72 +0,0 @@ -import abc -import logging - -import tensorflow as tf - -logger = logging.getLogger(__name__) - - -class JacobiansGLM: - """ - Compute the Jacobian matrix for a GLM. - """ - - def jac_analytic( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def jac_tf( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - @abc.abstractmethod - def _weights_jac_a( - self, - X, - loc, - scale - ): - """ - Compute the coefficient index invariant part of the - mean model gradient. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return const: tf1.tensor observations x features - Coefficient invariant terms of hessian of - given observations and features. - """ - pass - - @abc.abstractmethod - def _weights_jac_b( - self, - X, - loc, - scale - ): - """ - Compute the coefficient index invariant part of the - dispersion model gradient. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - - :return const: tf1.tensor observations x features - Coefficient invariant terms of hessian of - given observations and features. 
- """ - pass \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm/model.py b/batchglm/train/tf1/base_glm/model.py deleted file mode 100644 index c978cbd0..00000000 --- a/batchglm/train/tf1/base_glm/model.py +++ /dev/null @@ -1,166 +0,0 @@ -import abc -import logging -from typing import Union - -import tensorflow as tf -import numpy as np - -from .external import ProcessModelBase - -logger = logging.getLogger(__name__) - - -class ProcessModelGLM(ProcessModelBase): - - @abc.abstractmethod - def param_bounds(self, dtype: str): - pass - - -class ModelVarsGLM(ProcessModelGLM): - """ Build tf1.Variables to be optimzed and their constraints. - - a_var and b_var slices of the tf1.Variable params which contains - all parameters to be optimized during model estimation. - Params is defined across both location and scale model so that - the hessian can be computed for the entire model. - a and b are the clipped parameter values which also contain - constraints and constrained dependent coefficients which are not - directly optimized. - """ - - a: tf.Tensor - b: tf.Tensor - a_var: tf.Variable - b_var: tf.Variable - params: tf.Variable - converged: np.ndarray - - def __init__( - self, - dtype: str, - init_a: np.ndarray, - init_b: np.ndarray, - constraints_loc: tf.Tensor, - constraints_scale: tf.Tensor - ): - """ - - :param dtype: Precision used in tensorflow. - :param init_a: nd.array (mean model size x features) - Initialisation for all parameters of mean model. - :param init_b: nd.array (dispersion model size x features) - Initialisation for all parameters of dispersion model. - :param constraints_loc: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the mean model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param constraints_scale: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the dispersion model. - This form of constraints is used in vector generalized linear models (VGLMs). - """ - self.init_a = tf.convert_to_tensor(init_a, dtype=dtype) - self.init_b = tf.convert_to_tensor(init_b, dtype=dtype) - - init_a_clipped = self.tf_clip_param(self.init_a, "a_var") - init_b_clipped = self.tf_clip_param(self.init_b, "b_var") - - # Param is the only tf1.Variable in the graph. - # a_var and b_var have to be slices of params. 
- self.params = tf.Variable(tf.concat( - [ - init_a_clipped, - init_b_clipped, - ], - axis=0 - ), name="params") - - # Feature batching code for future: - #idx_featurebatch = tf1.random_uniform([100], minval=0, maxval=self.params.shape[1]-1, dtype=tf1.int32) - #params_featurebatch = tf1.gather(self.params, indi [:,idx_featurebatch] - - #params_by_gene = [tf1.expand_dims(params[:, i], axis=-1) for i in range(params.shape[1])] - #a_by_gene = [x[0:init_a.shape[0],:] for x in params_by_gene] - #b_by_gene = [x[init_a.shape[0]:, :] for x in params_by_gene] - #a_var = tf1.concat(a_by_gene, axis=1) - #b_var = tf1.concat(b_by_gene, axis=1) - a_var = self.params[0:init_a.shape[0]] - b_var = self.params[init_a.shape[0]:] - - self.a_var = self.tf_clip_param(a_var, "a_var") - self.b_var = self.tf_clip_param(b_var, "b_var") - - if constraints_loc is not None: - self.a = tf.matmul(constraints_loc, self.a_var) - else: - self.a = self.a_var - - if constraints_scale is not None: - self.b = tf.matmul(constraints_scale, self.b_var) - else: - self.b = self.b_var - - # Properties to follow gene-wise convergence. - self.updated = tf.Variable(np.repeat(a=True, repeats=self.params.shape[1])) # Initialise to is updated. - self.converged = tf.Variable(np.repeat(a=False, repeats=self.params.shape[1])) # Initialise to non-converged. - self.convergence_status = tf.compat.v1.placeholder(shape=[self.params.shape[1]], dtype=tf.bool) - self.convergence_update = tf.compat.v1.assign(self.converged, self.convergence_status) - #self.params_by_gene = params_by_gene - #self.a_by_gene = a_by_gene - #self.b_by_gene = b_by_gene - - self.dtype = dtype - self.constraints_loc = constraints_loc - self.constraints_scale = constraints_scale - self.n_features = self.params.shape[1] - self.idx_train_loc = np.arange(0, init_a.shape[0]) - self.idx_train_scale = np.arange(init_a.shape[0], init_a.shape[0]+init_b.shape[0]) - - @abc.abstractmethod - def param_bounds(self, dtype): - pass - - -class BasicModelGraphGLM(ProcessModelGLM): - """ - - """ - X: Union[tf.Tensor, tf.SparseTensor] - design_loc: tf.Tensor - design_scale: tf.Tensor - constraints_loc: tf.Tensor - constraints_scale: tf.Tensor - - probs: tf.Tensor - log_likelihood: tf.Tensor - norm_log_likelihood: tf.Tensor - norm_neg_log_likelihood: tf.Tensor - loss: tf.Tensor - - @property - def probs(self): - probs = tf.exp(self.log_probs) - return self.tf_clip_param(probs, "probs") - - @property - def log_likelihood(self): - return tf.reduce_sum(self.log_probs, axis=0, name="log_likelihood") - - @property - def norm_log_likelihood(self): - return tf.reduce_mean(self.log_probs, axis=0, name="log_likelihood") - - @property - def norm_neg_log_likelihood(self): - return - self.norm_log_likelihood - - @property - def loss(self): - return tf.reduce_sum(self.norm_neg_log_likelihood) - - @abc.abstractmethod - def param_bounds(self, dtype): - pass diff --git a/batchglm/train/tf1/base_glm/reducible_tensors.py b/batchglm/train/tf1/base_glm/reducible_tensors.py deleted file mode 100644 index 45b7f753..00000000 --- a/batchglm/train/tf1/base_glm/reducible_tensors.py +++ /dev/null @@ -1,351 +0,0 @@ -import logging -from typing import Union - -import tensorflow as tf - -from batchglm.train.tf1.base_glm.model import ModelVarsGLM - -logger = logging.getLogger("batchglm") - - -class ReducableTensorsGLM: - """ - """ - - noise_model: str - constraints_loc: tf.Tensor - constraints_scale: tf.Tensor - model_vars: ModelVarsGLM - noise_model: str - compute_a: bool - compute_b: bool - - jac: Union[tf.Tensor, None] - 
jac_a: Union[tf.Tensor, None] - jac_b: Union[tf.Tensor, None] - neg_jac: tf.Tensor - neg_jac_a: Union[tf.Tensor, None] - neg_jac_b: Union[tf.Tensor, None] - - hessian: Union[tf.Tensor, None] - hessian_aa: Union[tf.Tensor, None] - hessian_bb: Union[tf.Tensor, None] - neg_hessian: Union[tf.Tensor, None] - neg_hessian_aa: Union[tf.Tensor, None] - neg_hessian_bb: Union[tf.Tensor, None] - - fim_a: Union[tf.Tensor, None] - fim_b: Union[tf.Tensor, None] - - neg_loglikelihood: Union[tf.Tensor, None] - - def __init__( - self, - model_vars: ModelVarsGLM, - noise_model: str, - constraints_loc, - constraints_scale, - sample_indices = None, - data_set: tf.data.Dataset = None, - data_batch: tf.Tensor = None, - mode_jac="analytic", - mode_hessian="analytic", - mode_fim="analytic", - compute_a=True, - compute_b=True, - compute_jac=True, - compute_hessian=True, - compute_fim=True, - compute_ll=True - ): - """ Return computational graph for jacobian based on mode choice. - - :param batched_data: - Dataset iterator over mini-batches of data (used for training) or tf1.Tensor of mini-batch. - :param sample_indices: Indices of samples to be used. - :param constraints_loc: np.ndarray (constraints on mean model x mean model parameters) - Constraints for location model. - Array with constraints in rows and model parameters in columns. - Each constraint contains non-zero entries for the a of parameters that - has to sum to zero. This constraint is enforced by binding one parameter - to the negative sum of the other parameters, effectively representing that - parameter as a function of the other parameters. This dependent - parameter is indicated by a -1 in this array, the independent parameters - of that constraint (which may be dependent at an earlier constraint) - are indicated by a 1. - :param constraints_scale: np.ndarray (constraints on mean model x mean model parameters) - Constraints for scale model. - Array with constraints in rows and model parameters in columns. - Each constraint contains non-zero entries for the a of parameters that - has to sum to zero. This constraint is enforced by binding one parameter - to the negative sum of the other parameters, effectively representing that - parameter as a function of the other parameters. This dependent - parameter is indicated by a -1 in this array, the independent parameters - of that constraint (which may be dependent at an earlier constraint) - are indicated by a 1. - :param mode: str - Mode by with which hessian is to be evaluated, - "analytic" uses a closed form solution of the jacobian, - "tf1" allows for evaluation of the jacobian via the tf1.gradients function. - :param iterator: bool - Whether an iterator or a tensor (single yield of an iterator) is given - in. - :param jac_a: bool - Wether to compute Jacobian for a parameters. If both jac_a and jac_b are true, - the entire jacobian is computed in self.jac. - :param jac_b: bool - Wether to compute Jacobian for b parameters. If both jac_a and jac_b are true, - the entire jacobian is computed in self.jac. 
- """ - assert data_set is None or data_batch is None - - self.noise_model = noise_model - self.model_vars = model_vars - self.constraints_loc = constraints_loc - self.constraints_scale = constraints_scale - - self.compute_a = compute_a - self.compute_b = compute_b - - self.mode_jac = mode_jac - self.mode_hessian = mode_hessian - self.mode_fim = mode_fim - - self.compute_jac = compute_jac - self.compute_hessian = compute_hessian - self.compute_fim_a = compute_fim and compute_a - self.compute_fim_b = compute_fim and compute_b - self.compute_ll = compute_ll - - n_var_all = self.model_vars.params.shape[0] - n_var_a = self.model_vars.a_var.shape[0] - n_var_b = self.model_vars.b_var.shape[0] - dtype = self.model_vars.dtype - self.dtype = dtype - - def map_fun(idx, data): - return self.assemble_tensors( - idx=idx, - data=data - ) - - def init_fun(): - if self.compute_a and self.compute_b: - n_var_train = n_var_all - elif self.compute_a and not self.compute_b: - n_var_train = n_var_a - elif not self.compute_a and self.compute_b: - n_var_train = n_var_b - else: - n_var_train = 0 - - if self.compute_jac and n_var_train > 0: - jac_init = tf.zeros([model_vars.n_features, n_var_train], dtype=dtype) - else: - jac_init = tf.zeros((), dtype=dtype) - - if self.compute_hessian and n_var_train > 0: - hessian_init = tf.zeros([model_vars.n_features, n_var_train, n_var_train], dtype=dtype) - else: - hessian_init = tf.zeros((), dtype=dtype) - - if self.compute_fim_a: - fim_a_init = tf.zeros([model_vars.n_features, n_var_a, n_var_a], dtype=dtype) - else: - fim_a_init = tf.zeros((), dtype=dtype) - if self.compute_fim_b: - fim_b_init = tf.zeros([model_vars.n_features, n_var_b, n_var_b], dtype=dtype) - else: - fim_b_init = tf.zeros((), dtype=dtype) - - if self.compute_ll: - ll_init = tf.zeros([model_vars.n_features], dtype=dtype) - else: - ll_init = tf.zeros((), dtype=dtype) - - return jac_init, hessian_init, fim_a_init, fim_b_init, ll_init - - def reduce_fun(old, new): - return (tf.add(old[0], new[0]), - tf.add(old[1], new[1]), - tf.add(old[2], new[2]), - tf.add(old[3], new[3]), - tf.add(old[4], new[4])) - - if data_set is not None: - set_op = data_set.reduce( - initial_state=init_fun(), - reduce_func=lambda old, new: reduce_fun(old, map_fun(new[0], new[1])) - ) - jac, hessian, fim_a, fim_b, ll = set_op - elif data_batch is not None: - set_op = map_fun( - idx=sample_indices, - data=data_batch - ) - jac, hessian, fim_a, fim_b, ll = set_op - else: - raise ValueError("supply either data_set or data_batch") - - p_shape_a = self.model_vars.a_var.shape[0] # This has to be _var to work with constraints. 
- - # With relay across tf1.Variable: - # Containers and specific slices and transforms: - if self.compute_a and self.compute_b: - if self.compute_jac: - self.jac = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_all], dtype=dtype), dtype=dtype) - self.jac_a = self.jac[:, :p_shape_a] - self.jac_b = self.jac[:, p_shape_a:] - else: - self.jac = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.jac_a = self.jac - self.jac_b = self.jac - self.jac_train = self.jac - - if self.compute_hessian: - self.hessian = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_all, n_var_all], dtype=dtype), dtype=dtype) - self.hessian_aa = self.hessian[:, :p_shape_a, :p_shape_a] - self.hessian_bb = self.hessian[:, p_shape_a:, p_shape_a:] - else: - self.hessian = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.hessian_aa = self.hessian - self.hessian_bb = self.hessian - self.hessian_train = self.hessian - - if self.compute_fim_a or self.compute_fim_b: - self.fim_a = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_a, n_var_a], dtype=dtype), dtype=dtype) - self.fim_b = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_b, n_var_b], dtype=dtype), dtype=dtype) - else: - self.fim_a = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.fim_b = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - elif self.compute_a and not self.compute_b: - if self.compute_jac: - self.jac = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_a], dtype=dtype), dtype=dtype) - self.jac_a = self.jac - else: - self.jac = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.jac_a = self.jac - self.jac_b = None - self.jac_train = self.jac_a - - if self.compute_hessian: - self.hessian = tf.Variable(tf.zeros([model_vars.n_features, n_var_a, n_var_a], dtype=dtype), dtype=dtype) - self.hessian_aa = self.hessian - else: - self.hessian = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.hessian_aa = self.hessian - self.hessian_bb = None - self.hessian_train = self.hessian_aa - - if self.compute_fim_a: - self.fim_a = tf.Variable(tf.zeros([model_vars.n_features, n_var_a, n_var_a], dtype=dtype), dtype=dtype) - else: - self.fim_a = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.fim_b = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - elif not self.compute_a and self.compute_b: - if self.compute_jac: - self.jac = tf.Variable(tf.zeros([self.model_vars.n_features, n_var_b], dtype=dtype), dtype=dtype) - self.jac_b = self.jac - else: - self.jac = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.jac_b = self.jac - self.jac_a = None - self.jac_train = self.jac_b - - if self.compute_hessian: - self.hessian = tf.Variable(tf.zeros([model_vars.n_features, n_var_b, n_var_b], dtype=dtype), dtype=dtype) - self.hessian_bb = self.hessian - else: - self.hessian = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.hessian_bb = self.hessian - self.hessian_aa = None - self.hessian_train = self.hessian_bb - - self.fim_a = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - if self.compute_fim_b: - self.fim_b = tf.Variable(tf.zeros([model_vars.n_features, n_var_b, n_var_b], dtype=dtype), dtype=dtype) - else: - self.fim_b = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - else: - self.jac = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.jac_a = None - self.jac_b = None - self.jac_train = None - - self.hessian = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.hessian_aa = None - self.hessian_bb = None - self.hessian_train = None - - 
self.fim_a = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - self.fim_b = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - - if self.compute_ll: - self.ll = tf.Variable(tf.zeros([model_vars.n_features], dtype=dtype), dtype=dtype) - else: - self.ll = tf.Variable(tf.zeros((), dtype=dtype), dtype=dtype) - - self.neg_jac = tf.negative(self.jac) if self.jac is not None else None - self.neg_jac_a = tf.negative(self.jac_a) if self.jac_a is not None else None - self.neg_jac_b = tf.negative(self.jac_b) if self.jac_b is not None else None - self.neg_jac_train = tf.negative(self.jac_train) if self.jac_train is not None else None - - self.neg_hessian = tf.negative(self.hessian) if self.hessian is not None else None - self.neg_hessian_aa = tf.negative(self.hessian_aa) if self.hessian_aa is not None else None - self.neg_hessian_bb = tf.negative(self.hessian_bb) if self.hessian_bb is not None else None - self.neg_hessian_train = tf.negative(self.hessian_train) if self.hessian_train is not None else None - - self.neg_ll = tf.negative(self.ll) if self.ll is not None else None - - # Setting operation: - jac_set = tf.compat.v1.assign(self.jac, jac) - hessian_set = tf.compat.v1.assign(self.hessian, hessian) - fim_a_set = tf.compat.v1.assign(self.fim_a, fim_a) - fim_b_set = tf.compat.v1.assign(self.fim_b, fim_b) - ll_set = tf.compat.v1.assign(self.ll, ll) - - self.set = tf.group( - set_op, - jac_set, - hessian_set, - fim_a_set, - fim_b_set, - ll_set - ) - - def assemble_tensors( - self, - idx, - data - ): - raise NotImplementedError() - - def jac_analytic( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def jac_tf( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def hessian_analytic( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def hessian_tf( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() - - def fim_analytic( - self, - model - ) -> tf.Tensor: - raise NotImplementedError() \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/README.md b/batchglm/train/tf1/base_glm_all/README.md deleted file mode 100644 index 730604f9..00000000 --- a/batchglm/train/tf1/base_glm_all/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Classes with conditinoal import statements that yield class properties of desired kind. -For example: EstimatorGraph receives a child of BasicModelGraphGLM appropriate for the desired noise model. This is necessary in a separate module as the lengthy constructor calls to the noise model specific children of GLM classes would otherwise have to be repeated in the class definition of each noise model. 
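The pattern this README describes, choosing the noise-model-specific class at construction time via a conditional import, boils down to a dispatch like the following sketch (placeholder classes stand in for the real per-noise-model modules):

```python
class NBReducibleTensors: ...
class NormReducibleTensors: ...
class BetaReducibleTensors: ...

def reducible_tensors_for(noise_model: str):
    """Return the class for the requested noise model, mirroring the removed
    `if noise_model == "nb": from .external_nb import ...` blocks."""
    dispatch = {
        "nb": NBReducibleTensors,
        "norm": NormReducibleTensors,
        "beta": BetaReducibleTensors,
    }
    try:
        return dispatch[noise_model]
    except KeyError:
        raise ValueError("noise model %s was not recognized" % noise_model) from None

print(reducible_tensors_for("nb").__name__)  # NBReducibleTensors
```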
\ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/__init__.py b/batchglm/train/tf1/base_glm_all/__init__.py deleted file mode 100644 index 3b80760c..00000000 --- a/batchglm/train/tf1/base_glm_all/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .estimator import TFEstimatorGLM -from .estimator_graph import EstimatorGraphAll -from .fim import FIMGLMALL -from .jacobians import JacobiansGLMALL -from .hessians import HessianGLMALL -from .reducible_tensors import ReducableTensorsGLMALL \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/estimator.py b/batchglm/train/tf1/base_glm_all/estimator.py deleted file mode 100644 index 7c5cd720..00000000 --- a/batchglm/train/tf1/base_glm_all/estimator.py +++ /dev/null @@ -1,362 +0,0 @@ -import abc -from enum import Enum -import logging -import numpy as np -import scipy.sparse -import tensorflow as tf -from typing import Union - -from .estimator_graph import EstimatorGraphAll -from .external import _TFEstimator, InputDataGLM, _EstimatorGLM - - -class TFEstimatorGLM(_TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): - """ - Estimator for Generalized Linear Models (GLMs). - """ - - class TrainingStrategy(Enum): - pass - - model: EstimatorGraphAll - _train_loc: bool - _train_scale: bool - - def __init__( - self, - input_data: InputDataGLM, - batch_size: int, - graph: tf.Graph, - init_a: Union[np.ndarray], - init_b: Union[np.ndarray], - model: EstimatorGraphAll, - provide_optimizers: dict, - provide_batched: bool, - provide_fim: bool, - provide_hessian: bool, - extended_summary, - noise_model: str, - dtype: str - ): - """ - Create a new estimator for a GLM-like model. - - :param input_data: InputData - The input data - :param batch_size: int - Size of mini-batches used. - :param graph: (optional) tf1.Graph - :param init_model: (optional) - If provided, this model will be used to initialize this Estimator. - :param init_a: np.ndarray - Initialization of 'a' (location) model. - :param init_b: np.ndarray - Initialization of 'b' (scale) model. - :param quick_scale: bool - Whether `scale` will be fitted faster and maybe less accurate. - :param model: EstimatorGraph - EstimatorGraph to use. Basically for debugging. - :param provide_optimizers: - - E.g. {"gd": False, "adam": False, "adagrad": False, "rmsprop": False, - "nr": False, "nr_tr": True, "irls": False, "irls_tr": False} - :param provide_batched: bool - Whether mini-batched optimizers should be provided. - :param extended_summary: Include detailed information in the summaries. - Will increase runtime of summary writer, use only for debugging. - :param dtype: Precision used in tensorflow. 
- """ - if noise_model == "nb": - from .external_nb import EstimatorGraph - elif noise_model == "norm": - from .external_norm import EstimatorGraph - elif noise_model == "beta": - from .external_beta import EstimatorGraph - else: - raise ValueError("noise model %s was not recognized" % noise_model) - self.noise_model = noise_model - - # validate design matrix: - if np.linalg.matrix_rank(input_data.design_loc) != np.linalg.matrix_rank(input_data.design_loc.T): - raise ValueError("design_loc matrix is not full rank") - if np.linalg.matrix_rank(input_data.design_scale) != np.linalg.matrix_rank(input_data.design_scale.T): - raise ValueError("design_scale matrix is not full rank") - - # ### initialization - if model is None: - if graph is None: - graph = tf.Graph() - - # ### prepare fetch_fn: - def fetch_fn(idx): - r""" - Documentation of tensorflow coding style in this function: - tf1.py_func defines a python function (the getters of the InputData object slots) - as a tensorflow operation. Here, the shape of the tensor is lost and - has to be set with set_shape. For size factors, we use explicit broadcasting - as explained below. - """ - # Catch dimension collapse error if idx is only one element long, ie. 0D: - if len(idx.shape) == 0: - idx = tf.expand_dims(idx, axis=0) - - if isinstance(input_data.x, scipy.sparse.csr_matrix): - X_tensor_idx, X_tensor_val, X_shape = tf.py_function( - func=input_data.fetch_x_sparse, - inp=[idx], - Tout=[np.int64, np.float64, np.int64] - ) - # Note on Tout: np.float64 for val seems to be required to avoid crashing v1.12. - X_tensor_idx = tf.cast(X_tensor_idx, dtype=tf.int64) - X_shape = tf.cast(X_shape, dtype=tf.int64) - X_tensor_val = tf.cast(X_tensor_val, dtype=dtype) - X_tensor = (X_tensor_idx, X_tensor_val, X_shape) - else: - X_tensor = tf.py_function( - func=input_data.fetch_x_dense, - inp=[idx], - Tout=input_data.x.dtype - ) - X_tensor.set_shape(idx.get_shape().as_list() + [input_data.num_features]) - X_tensor = (tf.cast(X_tensor, dtype=dtype),) - - design_loc_tensor = tf.py_function( - func=input_data.fetch_design_loc, - inp=[idx], - Tout=input_data.design_loc.dtype - ) - design_loc_tensor.set_shape(idx.get_shape().as_list() + [input_data.num_design_loc_params]) - design_loc_tensor = tf.cast(design_loc_tensor, dtype=dtype) - - design_scale_tensor = tf.py_function( - func=input_data.fetch_design_scale, - inp=[idx], - Tout=input_data.design_scale.dtype - ) - design_scale_tensor.set_shape(idx.get_shape().as_list() + [input_data.num_design_scale_params]) - design_scale_tensor = tf.cast(design_scale_tensor, dtype=dtype) - - if input_data.size_factors is not None and noise_model in ["nb", "norm"]: - size_factors_tensor = tf.py_function( - func=input_data.fetch_size_factors, - inp=[idx], - Tout=input_data.size_factors.dtype - ) - size_factors_tensor.set_shape(idx.get_shape()) - size_factors_tensor = tf.expand_dims(size_factors_tensor, axis=-1) - size_factors_tensor = tf.cast(size_factors_tensor, dtype=dtype) - else: - size_factors_tensor = tf.constant(1, shape=[1, 1], dtype=dtype) - - size_factors_tensor = tf.broadcast_to(size_factors_tensor, - shape=[tf.size(idx), input_data.num_features]) - - # return idx, data - return idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) - - _TFEstimator.__init__( - self=self - ) - with graph.as_default(): - # create model - model = EstimatorGraph( - fetch_fn=fetch_fn, - feature_isnonzero=input_data.feature_isnonzero, - num_observations=input_data.num_observations, - 
num_features=input_data.num_features, - num_design_loc_params=input_data.num_design_loc_params, - num_design_scale_params=input_data.num_design_scale_params, - num_loc_params=input_data.num_loc_params, - num_scale_params=input_data.num_scale_params, - batch_size=np.min([batch_size, input_data.x.shape[0]]), - graph=graph, - init_a=init_a, - init_b=init_b, - constraints_loc=input_data.constraints_loc, - constraints_scale=input_data.constraints_scale, - provide_optimizers=provide_optimizers, - provide_batched=provide_batched, - provide_fim=provide_fim, - provide_hessian=provide_hessian, - train_loc=self._train_loc, - train_scale=self._train_scale, - extended_summary=extended_summary, - noise_model=self.noise_model, - dtype=dtype - ) - model.session = self.session - _EstimatorGLM.__init__( - self=self, - model=model, - input_data=input_data - ) - - def _scaffold(self): - with self.model.graph.as_default(): - scaffold = tf.compat.v1.train.Scaffold( - init_op=self.model.init_op, - summary_op=self.model.merged_summary, - saver=self.model.saver, - ) - return scaffold - - def train( - self, - *args, - learning_rate=None, - convergence_criteria="all_converged", - stopping_criteria=None, - train_loc: bool = None, - train_scale: bool = None, - use_batching=False, - optim_algo=None, - **kwargs - ): - r""" - Starts training of the model - - :param feed_dict: dict of values which will be feeded each `session.run()` - - See also feed_dict parameter of `session.run()`. - :param learning_rate: learning rate used for optimization - :param convergence_criteria: criteria after which the training will be interrupted. - Currently implemented criterias: - - - "step": - stop, when the step counter reaches `stopping_criteria` - :param stopping_criteria: Additional parameter for convergence criteria. - - See parameter `convergence_criteria` for exact meaning - :param train_loc: Set to True/False in order to enable/disable training of loc - :param train_scale: Set to True/False in order to enable/disable training of scale - :param use_batching: If True, will use mini-batches with the batch size defined in the constructor. - Otherwise, the gradient of the full dataset will be used. - :param optim_algo: name of the requested train op. - See :func:train_utils.MultiTrainer.train_op_by_name for further details. - """ - if train_loc is None: - # check if mu was initialized with MLE - train_loc = self._train_loc - if train_scale is None: - # check if r was initialized with MLE - train_scale = self._train_scale - - # Check whether newton-rhapson is desired: - require_hessian = False - require_fim = False - trustregion_mode = False - - if optim_algo.lower() == "newton" or \ - optim_algo.lower() == "newton_raphson" or \ - optim_algo.lower() == "nr": - require_hessian = True - - if optim_algo.lower() == "irls" or \ - optim_algo.lower() == "iwls" or \ - optim_algo.lower() == "irls_gd" or \ - optim_algo.lower() == "iwls_gd": - require_fim = True - - if optim_algo.lower() == "newton_tr" or \ - optim_algo.lower() == "nr_tr": - require_hessian = True - trustregion_mode = True - - if optim_algo.lower() == "irls_tr" or \ - optim_algo.lower() == "iwls_tr" or \ - optim_algo.lower() == "irls_gd_tr" or \ - optim_algo.lower() == "iwls_gd_tr": - require_fim = True - trustregion_mode = True - - # Set learning rate defaults if not set by user. 
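Side note: the chain of string comparisons above reduces to a small lookup — the optimizer name determines whether the trainer needs the Hessian (Newton-type updates), the Fisher information matrix (IRLS-type updates), and whether it runs in trust-region mode. As a standalone sketch using the same names the removed `train()` accepted:

```python
def optimizer_requirements(optim_algo: str):
    """Map an optimizer name to (require_hessian, require_fim, trustregion_mode)."""
    name = optim_algo.lower()
    require_hessian = name in {"newton", "newton_raphson", "nr", "newton_tr", "nr_tr"}
    require_fim = name in {"irls", "iwls", "irls_gd", "iwls_gd",
                           "irls_tr", "iwls_tr", "irls_gd_tr", "iwls_gd_tr"}
    trustregion_mode = name.endswith("_tr")
    return require_hessian, require_fim, trustregion_mode

print(optimizer_requirements("nr_tr"))    # (True, False, True)
print(optimizer_requirements("irls_gd"))  # (False, True, False)
```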
- if learning_rate is None: - if require_hessian or require_fim: - learning_rate = 1 - else: - learning_rate = 0.5 - - # Check that newton-rhapson is called properly: - if require_hessian or require_fim: - if learning_rate != 1: - logging.getLogger("batchglm").warning( - "Newton-rhapson or IRLS in base_glm_all is used with learning rate " + - str(learning_rate) + - ". Newton-rhapson and IRLS should only be used with learning rate = 1." - ) - - # Report all parameters after all defaults were imputed in settings: - logging.getLogger("batchglm").debug("Optimizer settings in base_glm_all Estimator.train():") - logging.getLogger("batchglm").debug("learning_rate " + str(learning_rate)) - logging.getLogger("batchglm").debug("convergence_criteria " + str(convergence_criteria)) - logging.getLogger("batchglm").debug("stopping_criteria " + str(stopping_criteria)) - logging.getLogger("batchglm").debug("train_loc " + str(train_loc)) - logging.getLogger("batchglm").debug("train_scale " + str(train_scale)) - logging.getLogger("batchglm").debug("use_batching " + str(use_batching)) - logging.getLogger("batchglm").debug("optim_algo " + str(optim_algo)) - if len(kwargs) > 0: - logging.getLogger("batchglm").debug("**kwargs: ") - logging.getLogger("batchglm").debug(kwargs) - - if train_loc or train_scale: - if use_batching: - train_op = self.model.trainer_batch.train_op_by_name(optim_algo) - else: - train_op = self.model.trainer_full.train_op_by_name(optim_algo) - - super()._train( - *args, - feed_dict={"learning_rate:0": learning_rate}, - convergence_criteria=convergence_criteria, - stopping_criteria=stopping_criteria, - train_op=train_op, - trustregion_mode=trustregion_mode, - require_hessian=require_hessian, - require_fim=require_fim, - is_batched=use_batching, - **kwargs - ) - - def finalize(self): - """ - Evaluate all tensors that need to be exported from session and save these as class attributes - and close session. - - Changes .model entry from tf1-based EstimatorGraph to numpy based Model instance and - transfers relevant attributes. 
- """ - self.session.run(self.model.full_data_model.final_set) - a_var = self.session.run(self.model.a_var) - b_var = self.session.run(self.model.b_var) - fisher_inv = self.session.run(self.model.fisher_inv) - hessian = self.session.run(self.model.hessian) - jacobian = self.session.run(self.model.gradients) - log_likelihood = self.session.run(self.model.log_likelihood) - loss = self.session.run(self.model.loss) - logging.getLogger("batchglm").debug("Closing session") - self.close_session() - self.model = self.get_model_container(self.input_data) - self.model._a_var = a_var - self.model._b_var = b_var - self._fisher_inv = fisher_inv - self._hessian = hessian - self._jacobian = jacobian - self._log_likelihood = log_likelihood - self._loss = loss - - @abc.abstractmethod - def get_model_container( - self, - input_data - ): - pass - - @abc.abstractmethod - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - pass diff --git a/batchglm/train/tf1/base_glm_all/estimator_graph.py b/batchglm/train/tf1/base_glm_all/estimator_graph.py deleted file mode 100644 index 1a05d6a5..00000000 --- a/batchglm/train/tf1/base_glm_all/estimator_graph.py +++ /dev/null @@ -1,543 +0,0 @@ -import logging -import numpy as np -import tensorflow as tf -from typing import Union - -from .external import EstimatorGraphGLM, FullDataModelGraphGLM, BatchedDataModelGraphGLM, ModelVarsGLM -from .external import pkg_constants - -logger = logging.getLogger(__name__) - - -class FullDataModelGraph(FullDataModelGraphGLM): - """ - Computational graph to evaluate GLM metrics on full data set. - - Evaluate model and cost function, Jacobians, Hessians and Fisher information matrix. - """ - - def __init__( - self, - num_observations, - sample_indices: tf.Tensor, - fetch_fn, - batch_size: Union[int, tf.Tensor], - model_vars: ModelVarsGLM, - constraints_loc, - constraints_scale, - noise_model, - train_a, - train_b, - compute_fim, - compute_hessian, - dtype - ): - """ - :param sample_indices: - TODO - :param fetch_fn: - TODO - :param batch_size: int - Size of mini-batches used. - :param model_vars: ModelVars - Variables of model. Contains tf1.Variables which are optimized. - :param constraints_loc: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the mean model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param constraints_scale: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the dispersion model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param train_mu: bool - Whether to train mean model. If False, the initialisation is kept. - :param train_r: bool - Whether to train dispersion model. If False, the initialisation is kept. - :param dtype: Precision used in tensorflow. 
- """ - if noise_model == "nb": - from .external_nb import ReducibleTensors - elif noise_model == "norm": - from .external_norm import ReducibleTensors - elif noise_model == "beta": - from .external_beta import ReducibleTensors - else: - raise ValueError("noise model not recognized") - self.noise_model = noise_model - - logger.debug("building input pipeline") - with tf.name_scope("input_pipeline"): - data_set = tf.data.Dataset.from_tensor_slices(sample_indices) - data_set = data_set.batch(batch_size) - data_set = data_set.map(fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) - - def map_sparse(idx, data): - X_tensor_ls, design_loc_tensor, design_scale_tensor, size_factors_tensor = data - if len(X_tensor_ls) > 1: - X_tensor = tf.SparseTensor(X_tensor_ls[0], X_tensor_ls[1], X_tensor_ls[2]) - X_tensor = tf.cast(X_tensor, dtype=dtype) - else: - X_tensor = X_tensor_ls[0] - return idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) - - data_set = data_set.map(map_sparse, num_parallel_calls=pkg_constants.TF_NUM_THREADS) - data_set = data_set.prefetch(1) - - with tf.name_scope("reducible_tensors_train"): - reducibles_train = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=sample_indices, - data_set=data_set, - data_batch=None, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=train_a, - compute_b=train_b, - compute_jac=True, - compute_hessian=compute_hessian, - compute_fim=compute_fim, - compute_ll=False - ) - self.neg_jac_train = reducibles_train.neg_jac_train - self.jac = reducibles_train.jac - self.neg_jac_a = reducibles_train.neg_jac_a - self.neg_jac_b = reducibles_train.neg_jac_b - self.jac_b = reducibles_train.jac_b - - self.hessians = reducibles_train.hessian - self.neg_hessians_train = reducibles_train.neg_hessian_train - - self.fim_a = reducibles_train.fim_a - self.fim_b = reducibles_train.fim_b - - self.train_set = reducibles_train.set - - with tf.name_scope("reducible_tensors_finalize"): - reducibles_finalize = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=sample_indices, - data_set=data_set, - data_batch=None, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=True, - compute_b=True, - compute_jac=True, - compute_hessian=True, - compute_fim=False, - compute_ll=True - ) - self.hessians_final = reducibles_finalize.hessian - self.neg_jac_final = reducibles_finalize.neg_jac - self.log_likelihood_final = reducibles_finalize.ll - self.loss_final = tf.reduce_sum(-self.log_likelihood_final / num_observations) - - self.final_set = reducibles_finalize.set - - with tf.name_scope("reducible_tensors_eval_ll"): - reducibles_eval0 = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=sample_indices, - data_set=data_set, - data_batch=None, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=train_a, - compute_b=train_b, - compute_jac=False, - compute_hessian=False, - compute_fim=False, - compute_ll=True - ) - self.log_likelihood_eval0 = reducibles_eval0.ll - self.norm_neg_log_likelihood_eval0 = 
-self.log_likelihood_eval0 / num_observations - self.loss_eval0 = tf.reduce_sum(self.norm_neg_log_likelihood_eval0) - - self.eval0_set = reducibles_eval0.set - - with tf.name_scope("reducible_tensors_eval_ll_jac"): - reducibles_eval1 = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=sample_indices, - data_set=data_set, - data_batch=None, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=train_a, - compute_b=train_b, - compute_jac=True, - compute_hessian=False, - compute_fim=False, - compute_ll=True - ) - self.log_likelihood_eval1 = reducibles_eval1.ll - self.norm_neg_log_likelihood_eval1 = -self.log_likelihood_eval1 / num_observations - self.loss_eval1 = tf.reduce_sum(self.norm_neg_log_likelihood_eval1) - self.neg_jac_train_eval = reducibles_eval1.neg_jac_train - - self.eval1_set = reducibles_eval1.set - - self.num_observations = num_observations - self.idx_train_loc = model_vars.idx_train_loc if train_a else np.array([]) - self.idx_train_scale = model_vars.idx_train_scale if train_b else np.array([]) - self.idx_train = np.sort(np.concatenate([self.idx_train_loc, self.idx_train_scale])) - - -class BatchedDataModelGraph(BatchedDataModelGraphGLM): - """ - Basic computational graph to evaluate GLM metrics on batched data set. - - Evaluate model and cost function and Jacobians, Hessians and Fisher information matrix. - """ - - def __init__( - self, - num_observations, - fetch_fn, - batch_size: Union[int, tf.Tensor], - buffer_size: int, - model_vars: ModelVarsGLM, - constraints_loc, - constraints_scale, - noise_model: str, - train_a, - train_b, - compute_fim, - compute_hessian, - dtype - ): - """ - :param fetch_fn: - TODO - :param batch_size: int - Size of mini-batches used. - :param model_vars: ModelVars - Variables of model. Contains tf1.Variables which are optimized. - :param constraints_loc: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the mean model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param constraints_scale: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the dispersion model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param dtype: Precision used in tensorflow. - """ - if noise_model == "nb": - from .external_nb import ReducibleTensors - elif noise_model == "norm": - from .external_norm import ReducibleTensors - elif noise_model == "beta": - from .external_beta import ReducibleTensors - else: - raise ValueError("noise model not recognized") - self.noise_model = noise_model - - with tf.name_scope("input_pipeline"): - data_set = tf.data.Dataset.from_tensor_slices(( - tf.range(num_observations, name="sample_index") - )) - data_set = data_set.shuffle(buffer_size=2 * batch_size) - data_set = data_set.repeat() - data_set = data_set.batch(batch_size, drop_remainder=True) - data_set = data_set.map(tf.contrib.framework.sort) # sort indices - TODO why? 
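Side note: the batched input pipeline built here is a standard `tf.data` pattern — shuffle the sample indices, repeat, draw fixed-size batches, sort the indices within each batch, then fetch and prefetch the actual data. Re-expressed with current TF 2.x ops (`tf.sort` replaces the removed `tf.contrib.framework.sort`; the fetch and prefetch steps that follow below are omitted):

```python
import tensorflow as tf

num_observations, batch_size = 20, 6

ds = tf.data.Dataset.from_tensor_slices(tf.range(num_observations, name="sample_index"))
ds = ds.shuffle(buffer_size=2 * batch_size)
ds = ds.repeat()
ds = ds.batch(batch_size, drop_remainder=True)  # fixed-size mini-batches
ds = ds.map(tf.sort)                            # sorted sample indices per batch

for idx in ds.take(2):
    print(idx.numpy())  # e.g. [ 1  3  7 12 15 19]
```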
- data_set = data_set.map(fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) - data_set = data_set.prefetch(buffer_size) - - def map_sparse(idx, data_batch): - X_tensor_ls, design_loc_tensor, design_scale_tensor, size_factors_tensor = data_batch - if len(X_tensor_ls) > 1: - X_tensor = tf.SparseTensor(X_tensor_ls[0], X_tensor_ls[1], X_tensor_ls[2]) - X_tensor = tf.cast(X_tensor, dtype=dtype) - else: - X_tensor = X_tensor_ls[0] - return idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) - - data_set = data_set.map(map_sparse, num_parallel_calls=pkg_constants.TF_NUM_THREADS) - iterator = tf.compat.v1.data.make_one_shot_iterator(data_set) - - batch_sample_index, batch_data = iterator.get_next() - - with tf.name_scope("reducible_tensors_train"): - reducibles_train = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=batch_sample_index, - data_set=None, - data_batch=batch_data, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=train_a, - compute_b=train_b, - compute_jac=True, - compute_hessian=compute_hessian, - compute_fim=compute_fim, - compute_ll=False - ) - - self.neg_jac_train = reducibles_train.neg_jac_train - self.jac = reducibles_train.jac - self.neg_jac_a = reducibles_train.neg_jac_a - self.neg_jac_b = reducibles_train.neg_jac_b - self.jac_b = reducibles_train.jac_b - - self.hessians = reducibles_train.hessian - self.neg_hessians_train = reducibles_train.neg_hessian_train - - self.fim_a = reducibles_train.fim_a - self.fim_b = reducibles_train.fim_b - - self.train_set = reducibles_train.set - - with tf.name_scope("reducible_tensors_eval"): - reducibles_eval = ReducibleTensors( - model_vars=model_vars, - noise_model=noise_model, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - sample_indices=batch_sample_index, - data_set=None, - data_batch=batch_data, - mode_jac=pkg_constants.JACOBIAN_MODE, - mode_hessian=pkg_constants.HESSIAN_MODE, - mode_fim=pkg_constants.FIM_MODE, - compute_a=True, - compute_b=True, - compute_jac=True, - compute_hessian=False, - compute_fim=False, - compute_ll=True - ) - - self.log_likelihood = reducibles_eval.ll - self.norm_log_likelihood = self.log_likelihood / num_observations - self.norm_neg_log_likelihood = -self.norm_log_likelihood - self.loss = tf.reduce_sum(self.norm_neg_log_likelihood) - - self.neg_jac_train_eval = reducibles_train.neg_jac_train - - self.eval_set = reducibles_eval.set - - self.num_observations = num_observations - self.idx_train_loc = model_vars.idx_train_loc if train_a else np.array([]) - self.idx_train_scale = model_vars.idx_train_scale if train_b else np.array([]) - self.idx_train = np.sort(np.concatenate([self.idx_train_loc, self.idx_train_scale])) - - -class EstimatorGraphAll(EstimatorGraphGLM): - """ - - Contains model_vars, full_data_model and batched_data_model which are the - primary training objects. All three also exist as *_eval which can be used - to perform and iterative optmization within a single parameter update, such - as during a line search. 
- """ - - mu: tf.Tensor - sigma2: tf.Tensor - - def __init__( - self, - fetch_fn, - feature_isnonzero, - num_observations, - num_features, - num_design_loc_params, - num_design_scale_params, - num_loc_params, - num_scale_params, - constraints_loc: np.ndarray, - constraints_scale: np.ndarray, - graph: tf.Graph, - batch_size: int, - init_a, - init_b, - train_loc: bool, - train_scale: bool, - provide_optimizers: Union[dict, None], - provide_batched: bool, - provide_hessian: bool, - provide_fim: bool, - extended_summary: bool, - noise_model: str, - dtype: str - ): - """ - - :param fetch_fn: - TODO - :param feature_isnonzero: - Whether all observations of a feature are zero. Features for which this - is the case are not fitted. - :param num_observations: int - Number of observations. - :param num_features: int - Number of features. - :param num_design_loc_params: int - Number of parameters per feature in mean model. - :param num_design_scale_params: int - Number of parameters per feature in scale model. - :param graph: tf1.Graph - :param batch_size: int - Size of mini-batches used. - :param init_a: nd.array (mean model size x features) - Initialisation for all parameters of mean model. - :param init_b: nd.array (dispersion model size x features) - Initialisation for all parameters of dispersion model. - :param constraints_loc: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the mean model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param constraints_scale: tensor (all parameters x dependent parameters) - Tensor that encodes how complete parameter set which includes dependent - parameters arises from indepedent parameters: all = . - This tensor describes this relation for the dispersion model. - This form of constraints is used in vector generalized linear models (VGLMs). - :param train_loc: bool - Whether to train mean model. If False, the initialisation is kept. - :param train_scale: bool - Whether to train dispersion model. If False, the initialisation is kept. - :param provide_optimizers: - :param extended_summary: - :param dtype: Precision used in tensorflow. 
- """ - if noise_model == "nb": - from .external_nb import ModelVars - elif noise_model == "norm": - from .external_norm import ModelVars - elif noise_model == "beta": - from .external_beta import ModelVars - else: - raise ValueError("noise model not recognized") - self.noise_model = noise_model - - EstimatorGraphGLM.__init__( - self=self, - num_observations=num_observations, - num_features=num_features, - num_design_loc_params=num_design_loc_params, - num_design_scale_params=num_design_scale_params, - num_loc_params=num_loc_params, - num_scale_params=num_scale_params, - graph=graph, - batch_size=batch_size, - constraints_loc=constraints_loc, - constraints_scale=constraints_scale, - dtype=dtype - ) - - # initial graph elements - with self.graph.as_default(): - - logger.debug("building models variables") - with tf.name_scope("model_vars"): - self.model_vars = ModelVars( - dtype=dtype, - init_a=init_a, - init_b=init_b, - constraints_loc=self.constraints_loc, - constraints_scale=self.constraints_scale - ) - - # ### performance related settings - buffer_size = 4 - - with tf.name_scope("batched_data"): - logger.debug("building batched data model") - if provide_batched: - self.batched_data_model = BatchedDataModelGraph( - num_observations=self.num_observations, - fetch_fn=fetch_fn, - batch_size=batch_size, - buffer_size=buffer_size, - model_vars=self.model_vars, - constraints_loc=self.constraints_loc, - constraints_scale=self.constraints_scale, - noise_model=noise_model, - train_a=train_loc, - train_b=train_scale, - compute_fim=provide_fim, - compute_hessian=provide_hessian, - dtype=dtype - ) - else: - self.batched_data_model = None - - with tf.name_scope("full_data"): - logger.debug("building full data model") - # ### alternative definitions for custom observations: - sample_selection = tf.compat.v1.placeholder_with_default( - tf.range(num_observations), - shape=(None,), - name="sample_selection" - ) - self.full_data_model = FullDataModelGraph( - num_observations=self.num_observations, - sample_indices=sample_selection, - fetch_fn=fetch_fn, - batch_size=batch_size, - model_vars=self.model_vars, - constraints_loc=self.constraints_loc, - constraints_scale=self.constraints_scale, - noise_model=noise_model, - train_a=train_loc, - train_b=train_scale, - compute_fim=provide_fim, - compute_hessian=provide_hessian, - dtype=dtype - ) - - logger.debug("building trainers") - self._run_trainer_init( - provide_optimizers=provide_optimizers, - train_loc=train_loc, - train_scale=train_scale, - dtype=dtype - ) - - # Define output metrics: - logger.debug("building outputs") - self._set_out_var( - feature_isnonzero=feature_isnonzero, - dtype=dtype - ) - self.loss = self.full_data_model.loss_final - self.log_likelihood = self.full_data_model.log_likelihood_final - self.hessian = self.full_data_model.hessians_final - self.fisher_inv = tf.linalg.inv(-self.full_data_model.hessians_final) # TODO switch for fim? 
- # Summary statistics on feature-wise model gradients: - self.gradients = tf.reduce_sum(tf.abs(self.full_data_model.neg_jac_final / num_observations), axis=1) - - with tf.name_scope('summaries'): - if extended_summary: - tf.summary.histogram('a_var', self.model_vars.a_var) - tf.summary.histogram('b_var', self.model_vars.b_var) - tf.summary.scalar('loss', self.full_data_model.loss) - tf.summary.scalar('learning_rate', self.learning_rate) - - self.saver = tf.compat.v1.train.Saver() - self.merged_summary = tf.compat.v1.summary.merge_all() diff --git a/batchglm/train/tf1/base_glm_all/external.py b/batchglm/train/tf1/base_glm_all/external.py deleted file mode 100644 index 0682a071..00000000 --- a/batchglm/train/tf1/base_glm_all/external.py +++ /dev/null @@ -1,12 +0,0 @@ -import batchglm.data as data_utils - -import batchglm.train.tf1.train as train_utils -from batchglm.train.tf1.base import TFEstimatorGraph, _TFEstimator -from batchglm.train.tf1.base_glm import GradientGraphGLM, NewtonGraphGLM, TrainerGraphGLM, EstimatorGraphGLM, FullDataModelGraphGLM, BatchedDataModelGraphGLM, BasicModelGraphGLM -from batchglm.train.tf1.base_glm import ProcessModelGLM, ModelVarsGLM, FIMGLM, HessiansGLM, JacobiansGLM, ReducableTensorsGLM - -from batchglm.models.base_glm import InputDataGLM, _ModelGLM, _EstimatorGLM - -import batchglm.train.tf1.ops as op_utils -from batchglm.utils.linalg import groupwise_solve_lm -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/base_glm_all/external_beta.py b/batchglm/train/tf1/base_glm_all/external_beta.py deleted file mode 100644 index 93cdd974..00000000 --- a/batchglm/train/tf1/base_glm_all/external_beta.py +++ /dev/null @@ -1,6 +0,0 @@ -from batchglm.train.tf1.glm_beta import EstimatorGraph -from batchglm.train.tf1.glm_beta import BasicModelGraph, ModelVars, ProcessModel -from batchglm.train.tf1.glm_beta import Hessians, FIM, Jacobians, ReducibleTensors - -from batchglm.models.glm_beta import InputDataGLM, Model -from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/external_nb.py b/batchglm/train/tf1/base_glm_all/external_nb.py deleted file mode 100644 index 97ec6bbe..00000000 --- a/batchglm/train/tf1/base_glm_all/external_nb.py +++ /dev/null @@ -1,6 +0,0 @@ -from batchglm.train.tf1.glm_nb import EstimatorGraph -from batchglm.train.tf1.glm_nb import BasicModelGraph, ModelVars, ProcessModel -from batchglm.train.tf1.glm_nb import Hessians, FIM, Jacobians, ReducibleTensors - -from batchglm.models.glm_nb import InputDataGLM, Model -from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/external_norm.py b/batchglm/train/tf1/base_glm_all/external_norm.py deleted file mode 100644 index 3b577f14..00000000 --- a/batchglm/train/tf1/base_glm_all/external_norm.py +++ /dev/null @@ -1,6 +0,0 @@ -from batchglm.train.tf1.glm_norm import EstimatorGraph -from batchglm.train.tf1.glm_norm import BasicModelGraph, ModelVars, ProcessModel -from batchglm.train.tf1.glm_norm import Hessians, FIM, Jacobians, ReducibleTensors - -from batchglm.models.glm_norm import InputDataGLM, Model -from batchglm.models.glm_norm.utils import closedform_norm_glm_mean, closedform_norm_glm_logsd \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/fim.py b/batchglm/train/tf1/base_glm_all/fim.py deleted file mode 100644 index 2126c0f3..00000000 --- 
a/batchglm/train/tf1/base_glm_all/fim.py +++ /dev/null @@ -1,115 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import FIMGLM - -logger = logging.getLogger(__name__) - - -class FIMGLMALL(FIMGLM): - """ - Compute the iteratively re-weighted least squares (IWLS or IRLS) - parameter updates for a negative binomial GLM. - """ - - def fim_a_analytic( - self, - model - ): - """ - Compute the closed-form of the base_glm_all model hessian - by evaluating its terms grouped by observations. - - Has three sub-functions which built the specific blocks of the hessian - and one sub-function which concatenates the blocks into a full hessian. - """ - - def _a_byobs(model): - """ - Compute the mean model diagonal block of the - closed form hessian of base_glm_all model by observation across features - for a batch of observations. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param model_loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param model_scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - """ - W = self._weight_fim_aa( # [observations x features] - loc=model.model_loc, - scale=model.model_scale - ) - # The computation of the hessian block requires two outer products between - # feature-wise constants and the coefficient wise design matrix entries, for each observation. - # The resulting tensor is observations x features x coefficients x coefficients which - # is too large too store in memory in most cases. However, the full 4D tensor is never - # actually needed but only its marginal across features, the final hessian block shape. - # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. - if self.constraints_loc is not None: - XH = tf.matmul(model.design_loc, self.constraints_loc) - else: - XH = model.design_loc - - fim = tf.einsum('ofc,od->fcd', - tf.einsum('of,oc->ofc', W, XH), - XH) - return fim - - if self.compute_fim_a: - fim_a = _a_byobs(model=model) - else: - fim_a = tf.zeros((), dtype=self.dtype) - - return fim_a - - def fim_b_analytic( - self, - model - ): - """ - Compute the closed-form of the base_glm_all model hessian - by evaluating its terms grouped by observations. - - Has three sub-functions which built the specific blocks of the hessian - and one sub-function which concatenates the blocks into a full hessian. - """ - - def _b_byobs(model): - """ - Compute the dispersion model diagonal block of the - closed form hessian of base_glm_all model by observation across features. - """ - W = self._weight_fim_bb( # [observations=1 x features] - loc=model.model_loc, - scale=model.model_scale - ) - # The computation of the hessian block requires two outer products between - # feature-wise constants and the coefficient wise design matrix entries, for each observation. - # The resulting tensor is observations x features x coefficients x coefficients which - # is too large too store in memory in most cases. However, the full 4D tensor is never - # actually needed but only its marginal across features, the final hessian block shape. - # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
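The nested einsum referred to in these comments contracts over observations without ever materialising the observations x features x coefficients x coefficients tensor. A small NumPy check of the identity (random toy shapes, purely illustrative):

import numpy as np

rng = np.random.default_rng(0)
n_obs, n_features, n_coefs = 5, 3, 2
W = rng.normal(size=(n_obs, n_features))   # per-observation, per-feature weights
X = rng.normal(size=(n_obs, n_coefs))      # design matrix (after constraints)

# one-shot contraction: result[f, c, d] = sum_o W[o, f] * X[o, c] * X[o, d]
fim = np.einsum('ofc,od->fcd', np.einsum('of,oc->ofc', W, X), X)

# reference: explicit sum of weighted outer products per feature
ref = np.zeros((n_features, n_coefs, n_coefs))
for o in range(n_obs):
    for f in range(n_features):
        ref[f] += W[o, f] * np.outer(X[o], X[o])

assert np.allclose(fim, ref)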
- if self.constraints_scale is not None: - XH = tf.matmul(model.design_scale, self.constraints_scale) - else: - XH = model.design_scale - - fim = tf.einsum('ofc,od->fcd', - tf.einsum('of,oc->ofc', W, XH), - XH) - return fim - - # The full fisher information matrix is block-diagonal with the cross-model - # blocks all zero. Accordingly, mean and dispersion model updates can be - # treated independently and the full fisher information matrix is never required. - # Here, the non-zero model-wise diagonal blocks are computed and returned - # as a dictionary. The according score function vectors are also returned as a dictionary. - if self.compute_fim_b: - fim_b = _b_byobs(model=model) - else: - fim_b = tf.zeros((), dtype=self.dtype) - - return fim_b diff --git a/batchglm/train/tf1/base_glm_all/hessians.py b/batchglm/train/tf1/base_glm_all/hessians.py deleted file mode 100644 index f6fcba18..00000000 --- a/batchglm/train/tf1/base_glm_all/hessians.py +++ /dev/null @@ -1,193 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import pkg_constants -from .external import HessiansGLM - -logger = logging.getLogger(__name__) - - -class HessianGLMALL(HessiansGLM): - """ - Compute the Hessian matrix for a GLM by gene using gradients from tensorflow. - """ - - def hessian_analytic( - self, - model - ) -> tf.Tensor: - """ - Compute the closed-form of the base_glm_all model hessian - by evaluating its terms grouped by observations. - - Has three sub-functions which built the specific blocks of the hessian - and one sub-function which concatenates the blocks into a full hessian. - """ - - def _aa_byobs_batched(model): - """ - Compute the mean model diagonal block of the - closed form hessian of base_glm_all model by observation across features - for a batch of observations. - """ - W = self._weight_hessian_aa( # [observations x features] - X=model.X, - loc=model.model_loc, - scale=model.model_scale, - ) - # The computation of the hessian block requires two outer products between - # feature-wise constants and the coefficient wise design matrix entries, for each observation. - # The resulting tensor is observations x features x coefficients x coefficients which - # is too large too store in memory in most cases. However, the full 4D tensor is never - # actually needed but only its marginal across features, the final hessian block shape. - # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. - if self.constraints_loc is not None: - XH = tf.matmul(model.design_loc, model.constraints_loc) - else: - XH = model.design_loc - - Hblock = tf.einsum('ofc,od->fcd', - tf.einsum('of,oc->ofc', W, XH), - XH) - return Hblock - - def _bb_byobs_batched(model): - """ - Compute the dispersion model diagonal block of the - closed form hessian of base_glm_all model by observation across features. - """ - W = self._weight_hessian_bb( # [observations=1 x features] - X=model.X, - loc=model.model_loc, - scale=model.model_scale, - ) - # The computation of the hessian block requires two outer products between - # feature-wise constants and the coefficient wise design matrix entries, for each observation. - # The resulting tensor is observations x features x coefficients x coefficients which - # is too large too store in memory in most cases. However, the full 4D tensor is never - # actually needed but only its marginal across features, the final hessian block shape. - # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. 
- if self.constraints_scale is not None: - XH = tf.matmul(model.design_scale, model.constraints_scale) - else: - XH = model.design_scale - - Hblock = tf.einsum('ofc,od->fcd', - tf.einsum('of,oc->ofc', W, XH), - XH) - return Hblock - - def _ab_byobs_batched(model): - """ - Compute the mean-dispersion model off-diagonal block of the - closed form hessian of base_glm_all model by observastion across features. - - Note that there are two blocks of the same size which can - be compute from each other with a transpose operation as - the hessian is symmetric. - """ - W = self._weight_hessian_ab( # [observations=1 x features] - X=model.X, - loc=model.model_loc, - scale=model.model_scale, - ) - # The computation of the hessian block requires two outer products between - # feature-wise constants and the coefficient wise design matrix entries, for each observation. - # The resulting tensor is observations x features x coefficients x coefficients which - # is too large too store in memory in most cases. However, the full 4D tensor is never - # actually needed but only its marginal across features, the final hessian block shape. - # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. - if self.constraints_loc is not None: - XHloc = tf.matmul(model.design_loc, model.constraints_loc) - else: - XHloc = model.design_loc - - if self.constraints_scale is not None: - XHscale = tf.matmul(model.design_scale, model.constraints_scale) - else: - XHscale = model.design_scale - - Hblock = tf.einsum('ofc,od->fcd', - tf.einsum('of,oc->ofc', W, XHloc), - XHscale) - return Hblock - - if self.compute_a and self.compute_b: - H_aa = _aa_byobs_batched(model=model) - H_bb = _bb_byobs_batched(model=model) - H_ab = _ab_byobs_batched(model=model) - H_ba = tf.transpose(H_ab, perm=[0, 2, 1]) - H = tf.concat( - [tf.concat([H_aa, H_ab], axis=2), - tf.concat([H_ba, H_bb], axis=2)], - axis=1 - ) - elif self.compute_a and not self.compute_b: - H = _aa_byobs_batched(model=model) - elif not self.compute_a and self.compute_b: - H = _bb_byobs_batched(model=model) - else: - H = tf.zeros((), dtype=self.dtype) - - return H - - def hessian_tf( - self, - model - ) -> tf.Tensor: - """ - Compute hessians via tf1.gradients for all gene-wise in parallel. - """ - if self.compute_a and self.compute_b: - var_shape = tf.shape(self.model_vars.params) - var = self.model_vars.params - elif self.compute_a and not self.compute_b: - var_shape = tf.shape(self.model_vars.a_var) - var = self.model_vars.a_var - elif not self.compute_a and self.compute_b: - var_shape = tf.shape(self.model_vars.b_var) - var = self.model_vars.b_var - - if self.compute_a or self.compute_b: - # Compute first order derivatives as first step to get second order derivatives. - first_der = tf.gradients(model.log_likelihood, var)[0] - - # Note on error comment below: The arguments that cause the error, infer_shape and element_shape, - # are not necessary for this code but would provide an extra layer of stability as all - # elements of the array have the same shape. - loop_vars = [ - tf.constant(0, tf.int32), # iteration counter - tf.TensorArray( # hessian slices [:,:,j] - dtype=var.dtype, - size=var_shape[0], - clear_after_read=False - #infer_shape=True, # TODO tf1>=2.0: this causes error related to eager execution in tf1.12 - #element_shape=var_shape - ) - ] - - # Compute second order derivatives based on parameter-wise slices of the tensor of first order derivatives. 
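The while loop that follows builds the Hessian one parameter slice at a time by re-differentiating the gradient. In eager TensorFlow 2 the same idea is a nested GradientTape; a toy quadratic objective stands in for the GLM log-likelihood here:

import tensorflow as tf

params = tf.Variable([0.5, -1.0, 2.0])

with tf.GradientTape() as outer_tape:
    with tf.GradientTape() as inner_tape:
        # stand-in objective; the removed code differentiates model.log_likelihood
        ll = -tf.reduce_sum(params ** 2) + params[0] * params[1]
    grad = inner_tape.gradient(ll, params)       # first-order derivatives
hessian = outer_tape.jacobian(grad, params)      # second-order derivatives, shape (3, 3)
print(hessian.numpy())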
- _, h_tensor_array = tf.while_loop( - cond=lambda i, _: i < var_shape[0], - body=lambda i, result: ( - i + 1, - result.write( - index=i, - value=tf.gradients(first_der[i, :], var)[0] - ) - ), - loop_vars=loop_vars, - return_same_structure=True - ) - - # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used - # in down-stream computation graphs. - h = tf.transpose(tf.reshape( - h_tensor_array.stack(), - tf.stack((var_shape[0], var_shape[0], var_shape[1])) - ), perm=[2, 1, 0]) - else: - h = tf.zeros((), dtype=self.dtype) - - return h \ No newline at end of file diff --git a/batchglm/train/tf1/base_glm_all/jacobians.py b/batchglm/train/tf1/base_glm_all/jacobians.py deleted file mode 100644 index b7be31e7..00000000 --- a/batchglm/train/tf1/base_glm_all/jacobians.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import JacobiansGLM - -logger = logging.getLogger(__name__) - - -class JacobiansGLMALL(JacobiansGLM): - """ - Compute the Jacobian matrix for a GLM using gradients from tensorflow. - """ - - def jac_analytic( - self, - model - ) -> tf.Tensor: - """ - Compute the closed-form of the base_glm_all model jacobian - by evalutating its terms grouped by observations. - """ - - def _a_byobs(X, design_loc, loc, scale): - """ - Compute the mean model block of the jacobian. - - :param X: tf1.tensor observations x features - Observation by observation and feature. - :param model_loc: tf1.tensor observations x features - Value of mean model by observation and feature. - :param model_scale: tf1.tensor observations x features - Value of dispersion model by observation and feature. - :return Jblock: tf1.tensor features x coefficients - Block of jacobian. - """ - W = self._weights_jac_a(X=X, loc=loc, scale=scale) # [observations, features] - if self.constraints_loc is not None: - XH = tf.matmul(design_loc, self.constraints_loc) - else: - XH = design_loc - - Jblock = tf.matmul(tf.transpose(W), XH) # [features, coefficients] - return Jblock - - def _b_byobs(X, design_scale, loc, scale): - """ - Compute the dispersion model block of the jacobian. - """ - W = self._weights_jac_b(X=X, loc=loc, scale=scale) # [observations, features] - if self.constraints_scale is not None: - XH = tf.matmul(design_scale, self.constraints_scale) - else: - XH = design_scale - - Jblock = tf.matmul(tf.transpose(W), XH) # [features, coefficients] - return Jblock - - if self.compute_a and self.compute_b: - J_a = _a_byobs(X=model.X, design_loc=model.design_loc, loc=model.model_loc, scale=model.model_scale) - J_b = _b_byobs(X=model.X, design_scale=model.design_scale, loc=model.model_loc, scale=model.model_scale) - J = tf.concat([J_a, J_b], axis=1) - elif self.compute_a and not self.compute_b: - J = _a_byobs(X=model.X, design_loc=model.design_loc, loc=model.model_loc, scale=model.model_scale) - elif not self.compute_a and self.compute_b: - J = _b_byobs(X=model.X, design_scale=model.design_scale, loc=model.model_loc, scale=model.model_scale) - else: - J = tf.zeros((), dtype=self.dtype) - - return J - - def jac_tf( - self, - model - ) -> tf.Tensor: - """ - Compute the Jacobian matrix for a GLM using gradients from tensorflow. 
- """ - def _jac(): - J = tf.gradients(model.log_likelihood, self.model_vars.params)[0] - J = tf.transpose(J) - return J - - def _jac_a(): - J_a = tf.gradients(model.log_likelihood, self.model_vars.a_var)[0] - J_a = tf.transpose(J_a) - return J_a - - def _jac_b(): - J_b = tf.gradients(model.log_likelihood, self.model_vars.b_var)[0] - J_b = tf.transpose(J_b) - return J_b - - if self.compute_a and self.compute_b: - J = _jac() - elif self.compute_a and not self.compute_b: - J = _jac_a() - elif not self.compute_a and self.compute_b: - J = _jac_b() - else: - J = tf.zeros((), dtype=self.dtype) - - return J diff --git a/batchglm/train/tf1/base_glm_all/reducible_tensors.py b/batchglm/train/tf1/base_glm_all/reducible_tensors.py deleted file mode 100644 index 2263c0a2..00000000 --- a/batchglm/train/tf1/base_glm_all/reducible_tensors.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import ReducableTensorsGLM - -logger = logging.getLogger("batchglm") - - -class ReducableTensorsGLMALL(ReducableTensorsGLM): - """ - """ - - def assemble_tensors(self, idx, data): - """ - Assemble jacobian of a batch of observations across all features. - - This function runs the data batch (an observation) through the - model graph and calls the wrappers that compute the - individual closed forms of the jacobian. - - :param idx: Indices of observations. - :param data: tuple - Containing the following parameters: - - X: tf1.tensor observations x features - Observation by observation and feature. - - size_factors: tf1.tensor observations x features - Model size factors by observation and feature. - - params: tf1.tensor features x coefficients - Estimated model variables. - :return J: tf1.tensor features x coefficients - Jacobian evaluated on a single observation, provided in data. 
- """ - if self.noise_model == "nb": - from .external_nb import BasicModelGraph - elif self.noise_model == "norm": - from .external_norm import BasicModelGraph - elif self.noise_model == "beta": - from .external_beta import BasicModelGraph - else: - raise ValueError("noise model %s was not recognized" % self.noise_model) - - X, design_loc, design_scale, size_factors = data - - model = BasicModelGraph( - X=X, - design_loc=design_loc, - design_scale=design_scale, - constraints_loc=self.constraints_loc, - constraints_scale=self.constraints_scale, - a_var=self.model_vars.a_var, - b_var=self.model_vars.b_var, - dtype=self.model_vars.dtype, - size_factors=size_factors - ) - dtype = model.dtype - - if self.compute_jac: - if self.mode_jac == "analytic": - jac = self.jac_analytic(model=model) - elif self.mode_jac == "tf1": - jac = self.jac_tf(model=model) - else: - raise ValueError("mode_jac %s not recognized" % self.mode_jac) - else: - jac = tf.zeros((), dtype=dtype) - - if self.compute_hessian: - if self.mode_hessian == "analytic": - hessian = self.hessian_analytic(model=model) - elif self.mode_hessian == "tf1": - hessian = self.hessian_tf(model=model) - else: - raise ValueError("mode_hessian %s not recognized" % self.mode_hessian) - else: - hessian = tf.zeros((), dtype=dtype) - - if self.compute_fim_a: - if self.mode_fim == "analytic": - fim_a = self.fim_a_analytic(model=model) - else: - raise ValueError("mode_fim %s not recognized" % self.mode_fim) - else: - fim_a = tf.zeros((), dtype=dtype) - - if self.compute_fim_b: - if self.mode_fim == "analytic": - fim_b = self.fim_b_analytic(model=model) - else: - raise ValueError("mode_fim %s not recognized" % self.mode_fim) - else: - fim_b = tf.zeros((), dtype=dtype) - - if self.compute_ll: - ll = model.log_likelihood - else: - ll = tf.zeros((), dtype=dtype) - - return [jac, hessian, fim_a, fim_b, ll] diff --git a/batchglm/train/tf1/external.py b/batchglm/train/tf1/external.py deleted file mode 100644 index 0b70405a..00000000 --- a/batchglm/train/tf1/external.py +++ /dev/null @@ -1 +0,0 @@ -from batchglm import pkg_constants \ No newline at end of file diff --git a/batchglm/train/tf1/glm_beta/__init__.py b/batchglm/train/tf1/glm_beta/__init__.py deleted file mode 100644 index 4db081bb..00000000 --- a/batchglm/train/tf1/glm_beta/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .estimator import Estimator -from .estimator_graph import EstimatorGraph -from .model import BasicModelGraph, ModelVars, ProcessModel -from .hessians import Hessians -from .fim import FIM -from .jacobians import Jacobians -from .reducible_tensors import ReducibleTensors diff --git a/batchglm/train/tf1/glm_beta/estimator.py b/batchglm/train/tf1/glm_beta/estimator.py deleted file mode 100644 index 6cd96878..00000000 --- a/batchglm/train/tf1/glm_beta/estimator.py +++ /dev/null @@ -1,291 +0,0 @@ -import logging -from typing import Union - -import numpy as np -import tensorflow as tf - -from .external import TFEstimatorGLM, InputDataGLM, Model -from .external import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize -from .estimator_graph import EstimatorGraph -from .model import ProcessModel -from .training_strategies import TrainingStrategies - - -class Estimator(TFEstimatorGLM, ProcessModel): - """ - Estimator for Generalized Linear Models (GLMs) with beta distributed noise. - Uses a logit linker function for loc and log linker function for scale. 
- """ - - def __init__( - self, - input_data: InputDataGLM, - batch_size: int = 512, - graph: tf.Graph = None, - init_model: Model = None, - init_a: Union[np.ndarray, str] = "AUTO", - init_b: Union[np.ndarray, str] = "AUTO", - quick_scale: bool = False, - model: EstimatorGraph = None, - provide_optimizers: dict = { - "gd": True, - "adam": True, - "adagrad": True, - "rmsprop": True, - "nr": True, - "nr_tr": True, - "irls": False, - "irls_gd": False, - "irls_tr": False, - "irls_gd_tr": False, - }, - provide_batched: bool = False, - provide_fim: bool = False, - provide_hessian: bool = False, - optim_algos: list = [], - extended_summary=False, - dtype="float64" - ): - """ - Performs initialisation and creates a new estimator. - - :param input_data: InputData - The input data - :param batch_size: int - Size of mini-batches used. - :param graph: (optional) tf1.Graph - :param init_model: (optional) - If provided, this model will be used to initialize this Estimator. - :param init_a: (Optional) - Low-level initial values for a. Can be: - - - str: - * "auto": automatically choose best initialization - * "random": initialize with random values - * "standard": initialize intercept with observed mean - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'a' - :param init_b: (Optional) - Low-level initial values for b. Can be: - - - str: - * "auto": automatically choose best initialization - * "random": initialize with random values - * "standard": initialize with zeros - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'b' - :param quick_scale: bool - Whether `scale` will be fitted faster and maybe less accurate. - Useful in scenarios where fitting the exact `scale` is not absolutely necessary. - :param model: EstimatorGraph - EstimatorGraph to use. Basically for debugging. - :param provide_optimizers: - - E.g. {"gd": False, "adam": False, "adagrad": False, "rmsprop": False, - "nr": False, "nr_tr": True, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - :param provide_batched: bool - Whether mini-batched optimizers should be provided. - :param provide_fim: Whether to compute fisher information matrix during training - Either supply provide_fim and provide_hessian or optim_algos. - :param provide_hessian: Whether to compute hessians during training - Either supply provide_fim and provide_hessian or optim_algos. - :param optim_algos: Algorithms that you want to use on this object. Depending on that, - the hessian and/or fisher information matrix are computed. - Either supply provide_fim and provide_hessian or optim_algos. - :param extended_summary: Include detailed information in the summaries. - Will increase runtime of summary writer, use only for debugging. - :param dtype: Precision used in tensorflow. 
- """ - self.TrainingStrategies = TrainingStrategies - - self._input_data = input_data - self._train_loc = True - self._train_scale = True - - (init_a, init_b) = self.init_par( - input_data=input_data, - init_a=init_a, - init_b=init_b, - init_model=init_model - ) - init_a = init_a.astype(dtype) - init_b = init_b.astype(dtype) - if quick_scale: - self._train_scale = False - - if len(optim_algos) > 0: - if np.any([x.lower() in ["nr", "nr_tr"] for x in optim_algos]): - provide_hessian = True - if np.any([x.lower() in ["irls", "irls_tr", "irls_gd", "irls_gd_tr"] for x in optim_algos]): - provide_fim = True - - TFEstimatorGLM.__init__( - self=self, - input_data=input_data, - batch_size=batch_size, - graph=graph, - init_a=init_a, - init_b=init_b, - model=model, - provide_optimizers=provide_optimizers, - provide_batched=provide_batched, - provide_fim=provide_fim, - provide_hessian=provide_hessian, - extended_summary=extended_summary, - noise_model="beta", - dtype=dtype - ) - - def get_model_container( - self, - input_data - ): - return Model(input_data=input_data) - - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - r""" - standard: - Only initialise intercept and keep other coefficients as zero. - - closed-form: - Initialize with Maximum Likelihood / Maximum of Momentum estimators - - Idea: - $$ - \theta &= f(x) \\ - \Rightarrow f^{-1}(\theta) &= x \\ - &= (D \cdot D^{+}) \cdot x \\ - &= D \cdot (D^{+} \cdot x) \\ - &= D \cdot x' = f^{-1}(\theta) - $$ - """ - - if init_model is None: - groupwise_means = None - init_a_str = None - if isinstance(init_a, str): - init_a_str = init_a.lower() - # Chose option if auto was chosen - if init_a.lower() == "auto": - init_a = "closed_form" - - if init_a.lower() == "closed_form": - groupwise_means, init_a, rmsd_a = closedform_beta_glm_logitmean( - x=input_data.x, - design_loc=input_data.design_loc, - constraints_loc=input_data.constraints_loc, - size_factors=input_data.size_factors, - link_fn=lambda mean: np.log( - 1/(1/self.np_clip_param(mean, "mean")-1) - ) - ) - - # train mu, if the closed-form solution is inaccurate - self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) - - logging.getLogger("batchglm").debug("Using closed-form MME initialization for mean") - elif init_a.lower() == "standard": - overall_means = np.mean(input_data.x, axis=0) - overall_means = self.np_clip_param(overall_means, "mean") - - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - init_a[0, :] = np.log(overall_means/(1-overall_means)) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using standard initialization for mean") - elif init_a.lower() == "all_zero": - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using all_zero initialization for mean") - else: - raise ValueError("init_a string %s not recognized" % init_a) - logging.getLogger("batchglm").debug("Should train mean: %s", self._train_loc) - if isinstance(init_b, str): - if init_b.lower() == "auto": - init_b = "standard" - - if init_b.lower() == "standard": - groupwise_scales, init_b_intercept, rmsd_b = closedform_beta_glm_logsamplesize( - x=input_data.x, - design_scale=input_data.design_scale[:, [0]], - constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=input_data.size_factors, - groupwise_means=None, - link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) - ) - init_b = np.zeros([input_data.num_scale_params, 
input_data.num_features]) - init_b[0, :] = init_b_intercept - - logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") - elif init_b.lower() == "closed_form": - dmats_unequal = False - if input_data.num_design_loc_params == input_data.num_design_scale_params: - if np.any(input_data.design_loc != input_data.design_scale): - dmats_unequal = True - - inits_unequal = False - if init_a_str is not None: - if init_a_str != init_b: - inits_unequal = True - - if inits_unequal or dmats_unequal: - raise ValueError("cannot use closed_form init for scale model " + - "if scale model differs from loc model") - - groupwise_scales, init_b, rmsd_b = closedform_beta_glm_logsamplesize( - x=input_data.x, - design_scale=input_data.design_scale, - constraints=input_data.constraints_scale, - size_factors=input_data.size_factors, - groupwise_means=groupwise_means, - link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) - ) - - logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") - elif init_b.lower() == "all_zero": - init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) - - logging.getLogger("batchglm").debug("Using standard initialization for dispersion") - else: - raise ValueError("init_b string %s not recognized" % init_b) - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - else: - # Locations model: - if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): - my_loc_names = set(input_data.loc_names) - my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) - - init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) - for parm in my_loc_names: - init_idx = np.where(init_model.input_data.loc_names == parm)[0] - my_idx = np.where(input_data.loc_names == parm)[0] - init_loc[my_idx] = init_model.a_var[init_idx] - - init_a = init_loc - logging.getLogger("batchglm").debug("Using initialization based on input model for mean") - - # Scale model: - if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): - my_scale_names = set(input_data.scale_names) - my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) - - init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) - for parm in my_scale_names: - init_idx = np.where(init_model.input_data.scale_names == parm)[0] - my_idx = np.where(input_data.scale_names == parm)[0] - init_scale[my_idx] = init_model.b_var[init_idx] - - init_b = init_scale - logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion") - - return init_a, init_b diff --git a/batchglm/train/tf1/glm_beta/estimator_graph.py b/batchglm/train/tf1/glm_beta/estimator_graph.py deleted file mode 100644 index 8e609600..00000000 --- a/batchglm/train/tf1/glm_beta/estimator_graph.py +++ /dev/null @@ -1,12 +0,0 @@ -import logging - -from .model import ProcessModel -from .external import EstimatorGraphAll - -logger = logging.getLogger(__name__) - - -class EstimatorGraph(ProcessModel, EstimatorGraphAll): - """ - Full class. 
- """ diff --git a/batchglm/train/tf1/glm_beta/external.py b/batchglm/train/tf1/glm_beta/external.py deleted file mode 100644 index d24db22b..00000000 --- a/batchglm/train/tf1/glm_beta/external.py +++ /dev/null @@ -1,18 +0,0 @@ -import batchglm.data as data_utils - -from batchglm.models.glm_beta import _EstimatorGLM, InputDataGLM, Model -from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale -from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize - -import batchglm.train.tf1.ops as op_utils -import batchglm.train.tf1.train as train_utils -from batchglm.train.tf1.base import TFEstimatorGraph - -from batchglm.train.tf1.base_glm import GradientGraphGLM, NewtonGraphGLM, TrainerGraphGLM, EstimatorGraphGLM, FullDataModelGraphGLM, BasicModelGraphGLM -from batchglm.train.tf1.base_glm import ProcessModelGLM, ModelVarsGLM -from batchglm.train.tf1.base_glm import HessiansGLM, FIMGLM, JacobiansGLM - -from batchglm.train.tf1.base_glm_all import TFEstimatorGLM, EstimatorGraphAll, FIMGLMALL, HessianGLMALL, JacobiansGLMALL, ReducableTensorsGLMALL - -from batchglm.utils.linalg import groupwise_solve_lm -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/glm_beta/fim.py b/batchglm/train/tf1/glm_beta/fim.py deleted file mode 100644 index e23b0a94..00000000 --- a/batchglm/train/tf1/glm_beta/fim.py +++ /dev/null @@ -1,25 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import FIMGLMALL - -logger = logging.getLogger(__name__) - - -class FIM(FIMGLMALL): - # No Fisher Information Matrices due to unsolvable E[log(X)] - - def _weight_fim_aa( - self, - loc, - scale - ): - assert False, "not implemented" - - def _weight_fim_bb( - self, - loc, - scale - ): - assert False, "not implemented" diff --git a/batchglm/train/tf1/glm_beta/hessians.py b/batchglm/train/tf1/glm_beta/hessians.py deleted file mode 100644 index 73ca76e5..00000000 --- a/batchglm/train/tf1/glm_beta/hessians.py +++ /dev/null @@ -1,92 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import HessianGLMALL - -logger = logging.getLogger(__name__) - - -class Hessians(HessianGLMALL): - - def _weight_hessian_aa( - self, - X, - loc, - scale, - ): - one_minus_loc = 1 - loc - loc_times_scale = loc * scale - one_minus_loc_times_scale = one_minus_loc * scale - - if isinstance(X, tf.SparseTensor): - # Using the dense matrix of the location model to serve the correct shapes for the sparse X. - const1 = tf.sparse_add(tf.zeros_like(loc), X).__div__(-tf.sparse.add(X, -tf.ones_like(loc))) - # Adding tf1.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, - # to_dense does not work. - else: - const1 = tf.log(X / (tf.ones_like(X) - X)) - - const2 = (1 - 2 * loc) * (- tf.digamma(loc_times_scale) + tf.digamma(one_minus_loc_times_scale) + const1) - const3 = loc * one_minus_loc_times_scale * (- tf.polygamma(tf.ones_like(loc), loc_times_scale) - tf.polygamma(tf.ones_like(loc), one_minus_loc_times_scale)) - const = loc * one_minus_loc_times_scale * (const2 + const3) - return const - - def _weight_hessian_ab( - self, - X, - loc, - scale, - ): - one_minus_loc = 1 - loc - loc_times_scale = loc * scale - one_minus_loc_times_scale = one_minus_loc * scale - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - - if isinstance(X, tf.SparseTensor): - # Using the dense matrix of the location model to serve the correct shapes for the sparse X. 
- const1 = tf.sparse_add(tf.zeros_like(loc), X).__div__(-tf.sparse.add(X, -tf.ones_like(loc))) - # Adding tf1.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, - # to_dense does not work. - else: - const1 = tf.log(X / (1 - X)) - - const2 = - tf.digamma(loc_times_scale) + tf.digamma(one_minus_loc_times_scale) + const1 - const3 = scale * (- tf.polygamma(scalar_one, loc_times_scale) * loc + one_minus_loc * tf.polygamma(scalar_one, one_minus_loc_times_scale)) - - const = loc * one_minus_loc_times_scale * (const2 + const3) - - return const - - def _weight_hessian_bb( - self, - X, - loc, - scale, - ): - one_minus_loc = 1 - loc - loc_times_scale = loc * scale - one_minus_loc_times_scale = one_minus_loc * scale - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - - if isinstance(X, tf.SparseTensor): - # Using the dense matrix of the location model to serve the correct shapes for the sparse X. - const1 = tf.sparse_add(tf.zeros_like(loc), X).__div__(-tf.sparse.add(X, -tf.ones_like(loc))) - # Adding tf1.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, - # to_dense does not work. - const2 = loc * (tf.log(tf.sparse_add(tf.zeros_like(loc), X)) - tf.digamma(loc_times_scale)) \ - - one_minus_loc * (tf.digamma(one_minus_loc_times_scale) + tf.log(const1)) \ - + tf.digamma(scale) - else: - const1 = tf.log(X / (1 - X)) - const2 = loc * (tf.log(X) - tf.digamma(loc_times_scale))\ - - one_minus_loc * (tf.digamma(one_minus_loc_times_scale) + tf.log(const1)) \ - + tf.digamma(scale) - const3 = scale * (- tf.square(loc) * tf.polygamma(scalar_one, loc_times_scale)\ - + tf.polygamma(scalar_one, scale)\ - - tf.polygamma(scalar_one, one_minus_loc_times_scale) * tf.square(one_minus_loc)) - const = scale * (const2 + const3) - - return const - - diff --git a/batchglm/train/tf1/glm_beta/jacobians.py b/batchglm/train/tf1/glm_beta/jacobians.py deleted file mode 100644 index d599636d..00000000 --- a/batchglm/train/tf1/glm_beta/jacobians.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import JacobiansGLMALL - -logger = logging.getLogger(__name__) - - -class Jacobians(JacobiansGLMALL): - - def _weights_jac_a( - self, - X, - loc, - scale, - ): - one_minus_loc = 1 - loc - if isinstance(X, tf.SparseTensor): - const1 = tf.log(tf.sparse_add(tf.zeros_like(loc), X).__div__(-tf.sparse.add(X, -tf.ones_like(loc)))) - else: - const1 = tf.log(X/(1-X)) - const2 = - tf.digamma(loc*scale) + tf.digamma(one_minus_loc*scale) + const1 - const = const2 * scale * loc * one_minus_loc - return const - - def _weights_jac_b( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - one_minus_X = - tf.sparse.add(X, -tf.ones_like(loc)) - else: - one_minus_X = 1 - X - one_minus_loc = 1 - loc - const = scale * (tf.digamma(scale) - tf.digamma(loc*scale) * loc - tf.digamma(one_minus_loc*scale) * one_minus_loc\ - + loc * tf.log(one_minus_X) + one_minus_loc * tf.log(one_minus_X)) - return const diff --git a/batchglm/train/tf1/glm_beta/model.py b/batchglm/train/tf1/glm_beta/model.py deleted file mode 100644 index 477747c3..00000000 --- a/batchglm/train/tf1/glm_beta/model.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging - -import tensorflow as tf - -import numpy as np - -from .external import ProcessModelGLM, ModelVarsGLM, BasicModelGraphGLM -from .external import pkg_constants - -logger = logging.getLogger(__name__) - - -class ProcessModel(ProcessModelGLM): - - def param_bounds( - self, - dtype - ): - if isinstance(dtype, tf.DType): - 
dmin = dtype.min - dmax = dtype.max - dtype = dtype.as_numpy_dtype - else: - dtype = np.dtype(dtype) - dmin = np.finfo(dtype).min - dmax = np.finfo(dtype).max - dtype = dtype.type - - zero = np.nextafter(0, np.inf, dtype=dtype) - one = np.nextafter(1, -np.inf, dtype=dtype) - - sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) - bounds_min = { - "a_var": np.log(zero/(1-zero)) / sf, - "b_var": np.log(zero) / sf, - "eta_loc": np.log(zero/(1-zero)) / sf, - "eta_scale": np.log(zero) / sf, - "mean": np.nextafter(0, np.inf, dtype=dtype), - "samplesize": np.nextafter(0, np.inf, dtype=dtype), - "probs": dtype(0), - "log_probs": np.log(zero), - } - bounds_max = { - "a_var": np.log(one/(1-one)) / sf, - "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "eta_loc": np.log(one/(1-one)) / sf, - "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "mean": one, - "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "probs": dtype(1), - "log_probs": dtype(0), - } - return bounds_min, bounds_max - - -class ModelVars(ProcessModel, ModelVarsGLM): - """ - Full class. - """ - - -class BasicModelGraph(ProcessModel, BasicModelGraphGLM): - - def __init__( - self, - X, - design_loc, - design_scale, - constraints_loc, - constraints_scale, - a_var, - b_var, - dtype, - size_factors=None - ): - a_var = self.tf_clip_param(a_var, "a_var") - b_var = self.tf_clip_param(b_var, "b_var") - - if constraints_loc is not None: - eta_loc = tf.matmul(design_loc, tf.matmul(constraints_loc, a_var)) - else: - eta_loc = tf.matmul(design_loc, a_var) - - eta_loc = self.tf_clip_param(eta_loc, "eta_loc") - - if constraints_scale is not None: - eta_scale = tf.matmul(design_scale, tf.matmul(constraints_scale, b_var)) - else: - eta_scale = tf.matmul(design_scale, b_var) - - eta_scale = self.tf_clip_param(eta_scale, "eta_scale") - - # Inverse linker functions: - model_loc = 1/(1+tf.exp(-eta_loc)) - model_scale = tf.exp(eta_scale) - - # Log-likelihood: - if isinstance(X, tf.SparseTensor): - one_minus_X = -tf.sparse.add(X, -tf.ones_like(model_loc)) - else: - one_minus_X = 1 - X - - one_minus_loc = 1 - model_loc - log_probs = tf.math.lgamma(model_scale) - tf.math.lgamma(model_loc * model_scale)\ - - tf.math.lgamma(one_minus_loc * model_scale)\ - + (model_scale * model_loc - 1) * tf.math.log(one_minus_X)\ - + (one_minus_loc * model_scale - 1) * tf.math.log(one_minus_X) - - log_probs = self.tf_clip_param(log_probs, "log_probs") - - # Variance: - sigma2 = (model_loc * one_minus_loc) / (1 + model_scale) - - self.X = X - self.design_loc = design_loc - self.design_scale = design_scale - self.constraints_loc = constraints_loc - self.constraints_scale = constraints_scale - self.a_var = a_var - self.b_var = b_var - self.size_factors = size_factors - self.dtype = dtype - - self.eta_loc = eta_loc - self.eta_scale = eta_scale - self.model_loc = model_loc - self.model_scale = model_scale - self.mean = model_loc - self.samplesize = model_scale - - self.log_probs = log_probs - - self.sigma2 = sigma2 \ No newline at end of file diff --git a/batchglm/train/tf1/glm_beta/reducible_tensors.py b/batchglm/train/tf1/glm_beta/reducible_tensors.py deleted file mode 100644 index a89103ea..00000000 --- a/batchglm/train/tf1/glm_beta/reducible_tensors.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging - -from .external import ReducableTensorsGLMALL -from .hessians import Hessians -from .jacobians import Jacobians -from .fim import FIM - -logger = logging.getLogger(__name__) - - -class ReducibleTensors(Jacobians, Hessians, 
FIM, ReducableTensorsGLMALL): - """ - """ diff --git a/batchglm/train/tf1/glm_beta/training_strategies.py b/batchglm/train/tf1/glm_beta/training_strategies.py deleted file mode 100644 index 9bd8b271..00000000 --- a/batchglm/train/tf1/glm_beta/training_strategies.py +++ /dev/null @@ -1,37 +0,0 @@ -from enum import Enum - -class TrainingStrategies(Enum): - - AUTO = None - DEFAULT = [ - { - "convergence_criteria": "all_converged_ll", - "stopping_criteria": 1e-8, - "use_batching": False, - "optim_algo": "nr_tr", - }, - ] - INEXACT = [ - { - "convergence_criteria": "all_converged_ll", - "stopping_criteria": 1e-6, - "use_batching": False, - "optim_algo": "nr_tr", - }, - ] - EXACT = [ - { - "convergence_criteria": "all_converged_ll", - "stopping_criteria": 1e-8, - "use_batching": False, - "optim_algo": "nr_tr", - }, - ] - IRLS = [ - { - "convergence_criteria": "all_converged_ll", - "stopping_criteria": 1e-8, - "use_batching": False, - "optim_algo": "irls_tr", - }, - ] \ No newline at end of file diff --git a/batchglm/train/tf1/glm_nb/__init__.py b/batchglm/train/tf1/glm_nb/__init__.py deleted file mode 100644 index 4db081bb..00000000 --- a/batchglm/train/tf1/glm_nb/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .estimator import Estimator -from .estimator_graph import EstimatorGraph -from .model import BasicModelGraph, ModelVars, ProcessModel -from .hessians import Hessians -from .fim import FIM -from .jacobians import Jacobians -from .reducible_tensors import ReducibleTensors diff --git a/batchglm/train/tf1/glm_nb/estimator.py b/batchglm/train/tf1/glm_nb/estimator.py deleted file mode 100644 index 65f0c592..00000000 --- a/batchglm/train/tf1/glm_nb/estimator.py +++ /dev/null @@ -1,152 +0,0 @@ -import logging -from typing import Union - -import numpy as np -try: - import tensorflow as tf -except: - tf = None - -from .external import TFEstimatorGLM, InputDataGLM, Model -from .external import init_par -from .estimator_graph import EstimatorGraph -from .model import ProcessModel -from .training_strategies import TrainingStrategies - - -class Estimator(TFEstimatorGLM, ProcessModel): - """ - Estimator for Generalized Linear Models (GLMs) with negative binomial noise. - Uses the natural logarithm as linker function. - """ - - def __init__( - self, - input_data: InputDataGLM, - batch_size: int = 512, - graph: tf.Graph = None, - init_model: Model = None, - init_a: Union[np.ndarray, str] = "AUTO", - init_b: Union[np.ndarray, str] = "AUTO", - quick_scale: bool = False, - model: EstimatorGraph = None, - provide_optimizers: dict = { - "gd": True, - "adam": True, - "adagrad": True, - "rmsprop": True, - "nr": True, - "nr_tr": True, - "irls": True, - "irls_gd": True, - "irls_tr": True, - "irls_gd_tr": True, - }, - provide_batched: bool = False, - provide_fim: bool = False, - provide_hessian: bool = False, - optim_algos: list = [], - extended_summary=False, - dtype="float64", - **kwargs - ): - """ - Performs initialisation and creates a new estimator. - - :param input_data: InputData - The input data - :param batch_size: int - Size of mini-batches used. - :param graph: (optional) tf1.Graph - :param init_model: (optional) - If provided, this model will be used to initialize this Estimator. - :param init_a: (Optional) - Low-level initial values for a. 
Can be: - - - str: - * "auto": automatically choose best initialization - * "random": initialize with random values - * "standard": initialize intercept with observed mean - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'a' - :param init_b: (Optional) - Low-level initial values for b. Can be: - - - str: - * "auto": automatically choose best initialization - * "random": initialize with random values - * "standard": initialize with zeros - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'b' - :param quick_scale: bool - Whether `scale` will be fitted faster and maybe less accurate. - Useful in scenarios where fitting the exact `scale` is not absolutely necessary. - :param model: EstimatorGraph - EstimatorGraph to use. Basically for debugging. - :param provide_optimizers: - - E.g. {"gd": False, "adam": False, "adagrad": False, "rmsprop": False, - "nr": False, "nr_tr": True, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - :param provide_batched: bool - Whether mini-batched optimizers should be provided. - :param provide_fim: Whether to compute fisher information matrix during training - Either supply provide_fim and provide_hessian or optim_algos. - :param provide_hessian: Whether to compute hessians during training - Either supply provide_fim and provide_hessian or optim_algos. - :param optim_algos: Algorithms that you want to use on this object. Depending on that, - the hessian and/or fisher information matrix are computed. - Either supply provide_fim and provide_hessian or optim_algos. - :param extended_summary: Include detailed information in the summaries. - Will increase runtime of summary writer, use only for debugging. - :param dtype: Precision used in tensorflow. - """ - if tf is None: - raise ValueError("tensorflow could not be imported." 
+ - "Install tensorflow to use Estimators from the tf1 submodule") - self.TrainingStrategies = TrainingStrategies - - self._input_data = input_data - init_a, init_b, train_loc, train_scale = init_par( - input_data=input_data, - init_a=init_a, - init_b=init_b, - init_model=None - ) - self._train_loc = train_loc - self._train_scale = train_scale - init_a = init_a.astype(dtype) - init_b = init_b.astype(dtype) - if quick_scale: - self._train_scale = False - - if len(optim_algos) > 0: - if np.any([x.lower() in ["nr", "nr_tr"] for x in optim_algos]): - provide_hessian = True - if np.any([x.lower() in ["irls", "irls_tr", "irls_gd", "irls_gd_tr"] for x in optim_algos]): - provide_fim = True - - TFEstimatorGLM.__init__( - self=self, - input_data=input_data, - batch_size=batch_size, - graph=graph, - init_a=init_a, - init_b=init_b, - model=model, - provide_optimizers=provide_optimizers, - provide_batched=provide_batched, - provide_fim=provide_fim, - provide_hessian=provide_hessian, - extended_summary=extended_summary, - noise_model="nb", - dtype=dtype - ) - - def get_model_container( - self, - input_data - ): - return Model(input_data=input_data) diff --git a/batchglm/train/tf1/glm_nb/estimator_graph.py b/batchglm/train/tf1/glm_nb/estimator_graph.py deleted file mode 100644 index 8e609600..00000000 --- a/batchglm/train/tf1/glm_nb/estimator_graph.py +++ /dev/null @@ -1,12 +0,0 @@ -import logging - -from .model import ProcessModel -from .external import EstimatorGraphAll - -logger = logging.getLogger(__name__) - - -class EstimatorGraph(ProcessModel, EstimatorGraphAll): - """ - Full class. - """ diff --git a/batchglm/train/tf1/glm_nb/external.py b/batchglm/train/tf1/glm_nb/external.py deleted file mode 100644 index 5f04c9cf..00000000 --- a/batchglm/train/tf1/glm_nb/external.py +++ /dev/null @@ -1,17 +0,0 @@ -import batchglm.data as data_utils - -from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model -from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale -from batchglm.models.glm_nb.utils import init_par - -import batchglm.train.tf1.train as train_utils -from batchglm.train.tf1.base import TFEstimatorGraph - -from batchglm.train.tf1.base_glm import GradientGraphGLM, NewtonGraphGLM, TrainerGraphGLM, EstimatorGraphGLM, FullDataModelGraphGLM, BasicModelGraphGLM -from batchglm.train.tf1.base_glm import ProcessModelGLM, ModelVarsGLM -from batchglm.train.tf1.base_glm import HessiansGLM, FIMGLM, JacobiansGLM - -from batchglm.train.tf1.base_glm_all import TFEstimatorGLM, EstimatorGraphAll, FIMGLMALL, HessianGLMALL, JacobiansGLMALL, ReducableTensorsGLMALL - -from batchglm.utils.linalg import groupwise_solve_lm -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/glm_nb/fim.py b/batchglm/train/tf1/glm_nb/fim.py deleted file mode 100644 index 32248733..00000000 --- a/batchglm/train/tf1/glm_nb/fim.py +++ /dev/null @@ -1,43 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import FIMGLMALL - -logger = logging.getLogger(__name__) - - -class FIM(FIMGLMALL): - - def _weight_fim_aa( - self, - loc, - scale - ): - const = tf.divide(scale, scale + loc) - W = tf.multiply(loc, const) - - return W - - def _weight_fim_bb( - self, - loc, - scale - ): - return tf.zeros_like(scale) - #scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - #scalar_two = tf.constant(2, shape=(), dtype=self.dtype) - #scale_plus_loc = scale + loc - #digamma_r = tf.math.digamma(x=scale) - #digamma_r_plus_mu = tf.math.digamma(x=scale_plus_loc) - #const1 = 
tf.multiply(scalar_two, tf.add( - # digamma_r, - # digamma_r_plus_mu - #)) - #const2 = tf.multiply(scale, tf.add( - # tf.math.polygamma(a=scalar_one, x=scale), - # tf.math.polygamma(a=scalar_one, x=scale_plus_loc) - #)) - #const3 = tf.divide(scale, scale_plus_loc) - #W = tf.multiply(scale, tf.add_n([const1, const2, const3])) - #return W diff --git a/batchglm/train/tf1/glm_nb/hessians.py b/batchglm/train/tf1/glm_nb/hessians.py deleted file mode 100644 index 87562c26..00000000 --- a/batchglm/train/tf1/glm_nb/hessians.py +++ /dev/null @@ -1,93 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import HessianGLMALL - -logger = logging.getLogger(__name__) - - -class Hessians(HessianGLMALL): - - def _weight_hessian_ab( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - X_minus_mu = tf.sparse.add(X, -loc) - else: - X_minus_mu = X - loc - - const = tf.multiply( - loc * scale, - tf.divide( - X_minus_mu, - tf.square(loc + scale) - ) - ) - - return const - - def _weight_hessian_aa( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - X_by_scale_plus_one = tf.sparse.add(X.__div__(scale), tf.ones_like(scale)) - else: - X_by_scale_plus_one = X / scale + tf.ones_like(scale) - - const = tf.negative(tf.multiply( - loc, - tf.divide( - X_by_scale_plus_one, - tf.square((loc / scale) + tf.ones_like(loc)) - ) - )) - - return const - - def _weight_hessian_bb( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - scale_plus_x = tf.sparse.add(X, scale) - else: - scale_plus_x = X + scale - - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - scalar_two = tf.constant(2, shape=(), dtype=self.dtype) - # Pre-define sub-graphs that are used multiple times: - scale_plus_loc = scale + loc - # Define graphs for individual terms of constant term of hessian: - const1 = tf.add( - tf.math.digamma(x=scale_plus_x), - scale * tf.math.polygamma(a=scalar_one, x=scale_plus_x) - ) - const2 = tf.negative(tf.add( - tf.math.digamma(x=scale), - scale * tf.math.polygamma(a=scalar_one, x=scale) - )) - const3 = tf.negative(tf.divide( - tf.add( - loc * scale_plus_x, - scalar_two * scale * scale_plus_loc - ), - tf.math.square(scale_plus_loc) - )) - const4 = tf.add( - tf.math.log(scale), - scalar_two - tf.math.log(scale_plus_loc) - ) - const = tf.add_n([const1, const2, const3, const4]) - const = tf.multiply(scale, const) - return const - - diff --git a/batchglm/train/tf1/glm_nb/jacobians.py b/batchglm/train/tf1/glm_nb/jacobians.py deleted file mode 100644 index 59c6b174..00000000 --- a/batchglm/train/tf1/glm_nb/jacobians.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import JacobiansGLMALL - -logger = logging.getLogger(__name__) - - -class Jacobians(JacobiansGLMALL): - - def _weights_jac_a( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - const = tf.multiply( - tf.sparse.add(X, scale), - tf.divide( - loc, - tf.add(loc, scale) - ) - ) - const = tf.sparse.add(X, -const) - else: - const = tf.multiply( - tf.add(X, scale), - tf.divide( - loc, - tf.add(loc, scale) - ) - ) - const = tf.subtract(X, const) - return const - - def _weights_jac_b( - self, - X, - loc, - scale, - ): - # Pre-define sub-graphs that are used multiple times: - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - if isinstance(X, tf.SparseTensor): - scale_plus_x = tf.sparse.add(X, scale) - else: - scale_plus_x = scale + X - - r_plus_mu = scale + loc - - # Define graphs for individual terms of constant term of hessian: 
- const1 = tf.subtract( - tf.math.digamma(x=scale_plus_x), - tf.math.digamma(x=scale) - ) - const2 = tf.negative(scale_plus_x / r_plus_mu) - const3 = tf.add( - tf.math.log(scale), - scalar_one - tf.math.log(r_plus_mu) - ) - const = tf.add_n([const1, const2, const3]) # [observations, features] - const = scale * const - - return const diff --git a/batchglm/train/tf1/glm_nb/model.py b/batchglm/train/tf1/glm_nb/model.py deleted file mode 100644 index fcec89ae..00000000 --- a/batchglm/train/tf1/glm_nb/model.py +++ /dev/null @@ -1,136 +0,0 @@ -import logging -import numpy as np -import tensorflow as tf - -from .external import ProcessModelGLM, ModelVarsGLM, BasicModelGraphGLM -from .external import pkg_constants - -logger = logging.getLogger(__name__) - - -class ProcessModel(ProcessModelGLM): - - def param_bounds( - self, - dtype - ): - if isinstance(dtype, tf.DType): - dmin = dtype.min - dmax = dtype.max - dtype = dtype.as_numpy_dtype - else: - dtype = np.dtype(dtype) - dmin = np.finfo(dtype).min - dmax = np.finfo(dtype).max - dtype = dtype.type - - sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) - bounds_min = { - "a_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "mu": np.nextafter(0, np.inf, dtype=dtype), - "r": np.nextafter(0, np.inf, dtype=dtype), - "probs": dtype(0), - "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), - } - bounds_max = { - "a_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "mu": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "r": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "probs": dtype(1), - "log_probs": dtype(0), - } - return bounds_min, bounds_max - - -class ModelVars(ProcessModel, ModelVarsGLM): - """ - Full class. 
- """ - - -class BasicModelGraph(ProcessModel, BasicModelGraphGLM): - - def __init__( - self, - X, - design_loc, - design_scale, - constraints_loc, - constraints_scale, - a_var, - b_var, - dtype, - size_factors=None - ): - a_var = self.tf_clip_param(a_var, "a_var") - b_var = self.tf_clip_param(b_var, "b_var") - - if constraints_loc is not None: - eta_loc = tf.matmul(design_loc, tf.matmul(constraints_loc, a_var)) - else: - eta_loc = tf.matmul(design_loc, a_var) - - if size_factors is not None: - eta_loc = tf.add(eta_loc, tf.math.log(size_factors)) - - eta_loc = self.tf_clip_param(eta_loc, "eta_loc") - - if constraints_scale is not None: - eta_scale = tf.matmul(design_scale, tf.matmul(constraints_scale, b_var)) - else: - eta_scale = tf.matmul(design_scale, b_var) - - eta_scale = self.tf_clip_param(eta_scale, "eta_scale") - - # Inverse linker functions: - model_loc = tf.math.exp(eta_loc) - model_scale = tf.math.exp(eta_scale) - - # Log-likelihood: - log_r_plus_mu = tf.math.log(model_scale + model_loc) - if isinstance(X, tf.SparseTensor): - log_probs_sparse = X.__mul__(eta_loc - log_r_plus_mu) - log_probs_dense = tf.math.lgamma(tf.sparse.add(X, model_scale)) - \ - tf.math.lgamma(tf.sparse.add(X, tf.ones(shape=X.dense_shape, dtype=dtype))) - \ - tf.math.lgamma(model_scale) + \ - tf.multiply(model_scale, eta_scale - log_r_plus_mu) - log_probs = tf.sparse.add(log_probs_sparse, log_probs_dense) - log_probs.set_shape([None, a_var.shape[1]]) # Need this so as shape is completely lost. - else: - log_probs = tf.math.lgamma(model_scale + X) - \ - tf.math.lgamma(X + tf.ones_like(X)) - \ - tf.math.lgamma(model_scale) + \ - tf.multiply(X, eta_loc - log_r_plus_mu) + \ - tf.multiply(model_scale, eta_scale - log_r_plus_mu) - - log_probs = self.tf_clip_param(log_probs, "log_probs") - - # Variance: - sigma2 = model_loc + tf.multiply(tf.square(model_loc), model_scale) - - self.X = X - self.design_loc = design_loc - self.design_scale = design_scale - self.constraints_loc = constraints_loc - self.constraints_scale = constraints_scale - self.a_var = a_var - self.b_var = b_var - self.size_factors = size_factors - self.dtype = dtype - - self.eta_loc = eta_loc - self.eta_scale = eta_scale - self.model_loc = model_loc - self.model_scale = model_scale - self.mu = model_loc - self.r = model_scale - - self.log_probs = log_probs - - self.sigma2 = sigma2 diff --git a/batchglm/train/tf1/glm_nb/reducible_tensors.py b/batchglm/train/tf1/glm_nb/reducible_tensors.py deleted file mode 100644 index 862ccaf8..00000000 --- a/batchglm/train/tf1/glm_nb/reducible_tensors.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging - -from .external import ReducableTensorsGLMALL -from .hessians import Hessians -from .jacobians import Jacobians -from .fim import FIM - -logger = logging.getLogger("batchglm") - - -class ReducibleTensors(Jacobians, Hessians, FIM, ReducableTensorsGLMALL): - """ - """ diff --git a/batchglm/train/tf1/glm_nb/training_strategies.py b/batchglm/train/tf1/glm_nb/training_strategies.py deleted file mode 100644 index d9e57377..00000000 --- a/batchglm/train/tf1/glm_nb/training_strategies.py +++ /dev/null @@ -1,27 +0,0 @@ -from enum import Enum - - -class TrainingStrategies(Enum): - - AUTO = None - DEFAULT = [ - { - "convergence_criteria": "all_converged", - "use_batching": False, - "optim_algo": "irls_gd_tr", - }, - ] - IRLS = [ - { - "convergence_criteria": "all_converged", - "use_batching": False, - "optim_algo": "irls_gd_tr", - }, - ] - IRLS_BATCHED = [ - { - "convergence_criteria": "all_converged", - "use_batching": True, - 
"optim_algo": "irls_gd_tr", - }, - ] diff --git a/batchglm/train/tf1/glm_norm/__init__.py b/batchglm/train/tf1/glm_norm/__init__.py deleted file mode 100644 index 4db081bb..00000000 --- a/batchglm/train/tf1/glm_norm/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .estimator import Estimator -from .estimator_graph import EstimatorGraph -from .model import BasicModelGraph, ModelVars, ProcessModel -from .hessians import Hessians -from .fim import FIM -from .jacobians import Jacobians -from .reducible_tensors import ReducibleTensors diff --git a/batchglm/train/tf1/glm_norm/estimator.py b/batchglm/train/tf1/glm_norm/estimator.py deleted file mode 100644 index bd1778ac..00000000 --- a/batchglm/train/tf1/glm_norm/estimator.py +++ /dev/null @@ -1,325 +0,0 @@ -import logging -import numpy as np -import scipy.sparse -import tensorflow as tf -from typing import Union - -from .external import TFEstimatorGLM, InputDataGLM, Model -from .external import closedform_norm_glm_mean, closedform_norm_glm_logsd -from .estimator_graph import EstimatorGraph -from .model import ProcessModel -from .training_strategies import TrainingStrategies - -logger = logging.getLogger("batchglm") - - -class Estimator(TFEstimatorGLM, ProcessModel): - """ - Estimator for Generalized Linear Models (GLMs) with normal distributed noise. - Uses the identity function as linker function for loc and a log-linker function for scale. - """ - - def __init__( - self, - input_data: InputDataGLM, - batch_size: int = 512, - graph: tf.Graph = None, - init_model: Model = None, - init_a: Union[np.ndarray, str] = "AUTO", - init_b: Union[np.ndarray, str] = "AUTO", - quick_scale: bool = False, - model: EstimatorGraph = None, - provide_optimizers: dict = { - "gd": True, - "adam": True, - "adagrad": True, - "rmsprop": True, - "nr": True, - "nr_tr": True, - "irls": True, - "irls_gd": True, - "irls_tr": True, - "irls_gd_tr": True, - }, - provide_batched: bool = False, - provide_fim: bool = False, - provide_hessian: bool = False, - optim_algos: list = [], - extended_summary=False, - dtype="float64" - ): - """ - Performs initialisation and creates a new estimator. - - :param input_data: InputData - The input data - :param batch_size: int - Size of mini-batches used. - :param graph: (optional) tf1.Graph - :param init_model: (optional) - If provided, this model will be used to initialize this Estimator. - :param init_a: (Optional) - Low-level initial values for a. Can be: - - - str: - * "auto": automatically choose best initialization - * "all zero": initialize with zeros - * "random": initialize with random values - * "standard": initialize intercept with observed mean - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'a' - :param init_b: (Optional) - Low-level initial values for b. Can be: - - - str: - * "auto": automatically choose best initialization - * "random": initialize with random values - * "standard": initialize with zeros - * "init_model": initialize with another model (see `ìnit_model` parameter) - * "closed_form": try to initialize with closed form - - np.ndarray: direct initialization of 'b' - :param quick_scale: bool - Whether `scale` will be fitted faster and maybe less accurate. - Useful in scenarios where fitting the exact `scale` is not absolutely necessary. - :param model: EstimatorGraph - EstimatorGraph to use. Basically for debugging. - :param provide_optimizers: - - E.g. 
{"gd": False, "adam": False, "adagrad": False, "rmsprop": False, - "nr": False, "nr_tr": True, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - :param provide_batched: bool - Whether mini-batched optimizers should be provided. - :param provide_fim: Whether to compute fisher information matrix during training - Either supply provide_fim and provide_hessian or optim_algos. - :param provide_hessian: Whether to compute hessians during training - Either supply provide_fim and provide_hessian or optim_algos. - :param optim_algos: Algorithms that you want to use on this object. Depending on that, - the hessian and/or fisher information matrix are computed. - Either supply provide_fim and provide_hessian or optim_algos. - :param extended_summary: Include detailed information in the summaries. - Will increase runtime of summary writer, use only for debugging. - :param dtype: Precision used in tensorflow. - """ - self.TrainingStrategies = TrainingStrategies - - self._input_data = input_data - self._train_loc = True - self._train_scale = True - - (init_a, init_b) = self.init_par( - input_data=input_data, - init_a=init_a, - init_b=init_b, - init_model=init_model - ) - init_a = init_a.astype(dtype) - init_b = init_b.astype(dtype) - if quick_scale: - self._train_scale = False - - if len(optim_algos) > 0: - if np.any([x.lower() in ["nr", "nr_tr"] for x in optim_algos]): - provide_hessian = True - if np.any([x.lower() in ["irls", "irls_tr"] for x in optim_algos]): - provide_fim = True - - TFEstimatorGLM.__init__( - self=self, - input_data=input_data, - batch_size=batch_size, - graph=graph, - init_a=init_a, - init_b=init_b, - model=model, - provide_optimizers=provide_optimizers, - provide_batched=provide_batched, - provide_fim=provide_fim, - provide_hessian=provide_hessian, - extended_summary=extended_summary, - noise_model="norm", - dtype=dtype - ) - - def get_model_container( - self, - input_data - ): - return Model(input_data=input_data) - - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - r""" - standard: - Only initialise intercept and keep other coefficients as zero. - - closed-form: - Initialize with Maximum Likelihood / Maximum of Momentum estimators - - Idea: - $$ - \theta &= f(x) \\ - \Rightarrow f^{-1}(\theta) &= x \\ - &= (D \cdot D^{+}) \cdot x \\ - &= D \cdot (D^{+} \cdot x) \\ - &= D \cdot x' = f^{-1}(\theta) - $$ - """ - - sf_given = False - if input_data.size_factors is not None: - if np.any(np.abs(input_data.size_factors - 1.) > 1e-8): - sf_given = True - - is_ols_model = input_data.design_scale.shape[1] == 1 and \ - np.all(np.abs(input_data.design_scale - 1.) < 1e-8) and \ - not sf_given - - if init_model is None: - groupwise_means = None - init_a_str = None - if isinstance(init_a, str): - init_a_str = init_a.lower() - # Chose option if auto was chosen - if init_a.lower() == "auto": - init_a = "closed_form" - - if init_a.lower() == "closed_form" or init_a.lower() == "standard": - design_constr = np.matmul(input_data.design_loc, input_data.constraints_loc) - # Iterate over genes if X is sparse to avoid large sparse tensor. - # If X is dense, the least square problem can be vectorised easily. - if isinstance(input_data.x, scipy.sparse.csr_matrix): - init_a, rmsd_a, _, _ = np.linalg.lstsq( - np.matmul(design_constr.T, design_constr), - input_data.x.T.dot(design_constr).T, # need double .T because of dot product on sparse. 
- rcond=None - ) - else: - init_a, rmsd_a, _, _ = np.linalg.lstsq( - np.matmul(design_constr.T, design_constr), - np.matmul(design_constr.T, input_data.x), - rcond=None - ) - groupwise_means = None - if is_ols_model: - self._train_loc = False - - logger.debug("Using OLS initialization for location model") - elif init_a.lower() == "all_zero": - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - self._train_loc = True - - logger.debug("Using all_zero initialization for mean") - else: - raise ValueError("init_a string %s not recognized" % init_a) - logger.debug("Should train location model: %s", self._train_loc) - - if isinstance(init_b, str): - if init_b.lower() == "auto": - init_b = "standard" - - if is_ols_model: - # Calculated variance via E(x)^2 or directly depending on whether `mu` was specified. - if isinstance(input_data.x, scipy.sparse.csr_matrix): - expect_xsq = np.asarray(np.mean(input_data.x.power(2), axis=0)) - else: - expect_xsq = np.expand_dims(np.mean(np.square(input_data.x), axis=0), axis=0) - mean_model = np.matmul( - np.matmul(input_data.design_loc, input_data.constraints_loc), - init_a - ) - expect_x_sq = np.mean(np.square(mean_model), axis=0) - variance = (expect_xsq - expect_x_sq) - init_b = np.log(np.sqrt(variance)) - self._train_scale = False - - logger.debug("Using residuals from OLS estimate for variance estimate") - elif init_b.lower() == "closed_form": - dmats_unequal = False - if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: - if np.any(input_data.design_loc != input_data.design_scale): - dmats_unequal = True - - inits_unequal = False - if init_a_str is not None: - if init_a_str != init_b: - inits_unequal = True - - # Watch out: init_mean is full obs x features matrix and is very large in many cases. 
- if inits_unequal or dmats_unequal: - raise ValueError("cannot use closed_form init for scale model " + - "if scale model differs from loc model") - - groupwise_scales, init_b, rmsd_b = closedform_norm_glm_logsd( - x=input_data.x, - design_scale=input_data.design_scale, - constraints=input_data.constraints_scale, - size_factors=input_data.size_factors, - groupwise_means=groupwise_means, - link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) - ) - - # train scale, if the closed-form solution is inaccurate - self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) - - logger.debug("Using closed-form MME initialization for standard deviation") - elif init_b.lower() == "standard": - groupwise_scales, init_b_intercept, rmsd_b = closedform_norm_glm_logsd( - x=input_data.x, - design_scale=input_data.design_scale[:, [0]], - constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=input_data.size_factors, - groupwise_means=None, - link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) - ) - init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) - init_b[0, :] = init_b_intercept - - # train scale, if the closed-form solution is inaccurate - self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) - - logger.debug("Using closed-form MME initialization for standard deviation") - logger.debug("Should train sd: %s", self._train_scale) - elif init_b.lower() == "all_zero": - init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) - - logger.debug("Using standard initialization for standard deviation") - else: - raise ValueError("init_b string %s not recognized" % init_b) - logger.debug("Should train sd: %s", self._train_scale) - else: - # Locations model: - if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): - my_loc_names = set(input_data.loc_names) - my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) - - init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) - for parm in my_loc_names: - init_idx = np.where(init_model.input_data.loc_names == parm)[0] - my_idx = np.where(input_data.loc_names == parm)[0] - init_loc[my_idx] = init_model.a_var[init_idx] - - init_a = init_loc - logger.debug("Using initialization based on input model for mean") - - # Scale model: - if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): - my_scale_names = set(input_data.scale_names) - my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) - - init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) - for parm in my_scale_names: - init_idx = np.where(init_model.input_data.scale_names == parm)[0] - my_idx = np.where(input_data.scale_names == parm)[0] - init_scale[my_idx] = init_model.b_var[init_idx] - - init_b = init_scale - logger.debug("Using initialization based on input model for dispersion") - - return init_a, init_b diff --git a/batchglm/train/tf1/glm_norm/estimator_graph.py b/batchglm/train/tf1/glm_norm/estimator_graph.py deleted file mode 100644 index 8e609600..00000000 --- a/batchglm/train/tf1/glm_norm/estimator_graph.py +++ /dev/null @@ -1,12 +0,0 @@ -import logging - -from .model import ProcessModel -from .external import EstimatorGraphAll - -logger = logging.getLogger(__name__) - - -class EstimatorGraph(ProcessModel, EstimatorGraphAll): - """ - Full class. 
- """ diff --git a/batchglm/train/tf1/glm_norm/external.py b/batchglm/train/tf1/glm_norm/external.py deleted file mode 100644 index 3acba1c9..00000000 --- a/batchglm/train/tf1/glm_norm/external.py +++ /dev/null @@ -1,18 +0,0 @@ -import batchglm.data as data_utils - -from batchglm.models.glm_norm import _EstimatorGLM, InputDataGLM, Model -from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale -from batchglm.models.glm_norm.utils import closedform_norm_glm_mean, closedform_norm_glm_logsd - -import batchglm.train.tf1.ops as op_utils -import batchglm.train.tf1.train as train_utils -from batchglm.train.tf1.base import TFEstimatorGraph - -from batchglm.train.tf1.base_glm import GradientGraphGLM, NewtonGraphGLM, TrainerGraphGLM, EstimatorGraphGLM, FullDataModelGraphGLM, BasicModelGraphGLM -from batchglm.train.tf1.base_glm import ProcessModelGLM, ModelVarsGLM -from batchglm.train.tf1.base_glm import HessiansGLM, FIMGLM, JacobiansGLM - -from batchglm.train.tf1.base_glm_all import TFEstimatorGLM, EstimatorGraphAll, FIMGLMALL, HessianGLMALL, JacobiansGLMALL, ReducableTensorsGLMALL - -from batchglm.utils.linalg import groupwise_solve_lm -from batchglm import pkg_constants diff --git a/batchglm/train/tf1/glm_norm/fim.py b/batchglm/train/tf1/glm_norm/fim.py deleted file mode 100644 index 06fce476..00000000 --- a/batchglm/train/tf1/glm_norm/fim.py +++ /dev/null @@ -1,28 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import FIMGLMALL - -logger = logging.getLogger(__name__) - - -class FIM(FIMGLMALL): - - def _weight_fim_aa( - self, - loc, - scale - ): - W = tf.square(tf.divide(tf.ones_like(scale), scale)) - - return W - - def _weight_fim_bb( - self, - loc, - scale - ): - W = tf.constant(2, shape=loc.shape, dtype=self.dtype) - - return W diff --git a/batchglm/train/tf1/glm_norm/hessians.py b/batchglm/train/tf1/glm_norm/hessians.py deleted file mode 100644 index 69238c12..00000000 --- a/batchglm/train/tf1/glm_norm/hessians.py +++ /dev/null @@ -1,66 +0,0 @@ -import tensorflow as tf - -import logging - -from .external import HessianGLMALL - -logger = logging.getLogger(__name__) - - -class Hessians(HessianGLMALL): - - def _weight_hessian_ab( - self, - X, - loc, - scale, - ): - scalar_two = tf.constant(2, shape=(), dtype=self.dtype) - if isinstance(X, tf.SparseTensor): - X_minus_loc = tf.sparse.add(X, -loc) - else: - X_minus_loc = X - loc - - const = - tf.multiply(scalar_two, - tf.divide( - X_minus_loc, - tf.square(scale) - ) - ) - return const - - def _weight_hessian_aa( - self, - X, - loc, - scale, - ): - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - const = - tf.divide(scalar_one, tf.square(scale)) - - return const - - def _weight_hessian_bb( - self, - X, - loc, - scale, - ): - scalar_two = tf.constant(2, shape=(), dtype=self.dtype) - if isinstance(X, tf.SparseTensor): - X_minus_loc = tf.sparse.add(X, -loc) - else: - X_minus_loc = X - loc - - const = - tf.multiply( - scalar_two, - tf.math.square( - tf.divide( - X_minus_loc, - scale - ) - ) - ) - return const - - diff --git a/batchglm/train/tf1/glm_norm/jacobians.py b/batchglm/train/tf1/glm_norm/jacobians.py deleted file mode 100644 index 04a60d88..00000000 --- a/batchglm/train/tf1/glm_norm/jacobians.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging - -import tensorflow as tf - -from .external import JacobiansGLMALL - -logger = logging.getLogger(__name__) - - -class Jacobians(JacobiansGLMALL): - - def _weights_jac_a( - self, - X, - loc, - scale, - ): - if isinstance(X, tf.SparseTensor): - const1 = 
tf.sparse.add(X, -loc) - const = tf.divide(const1, tf.square(scale)) - else: - const1 = tf.subtract(X, loc) - const = tf.divide(const1, tf.square(scale)) - return const - - def _weights_jac_b( - self, - X, - loc, - scale, - ): - scalar_one = tf.constant(1, shape=(), dtype=self.dtype) - if isinstance(X, tf.SparseTensor): - const = tf.negative(scalar_one) + tf.math.square( - tf.divide(tf.sparse.add(X, -loc), scale) - ) - else: - const = tf.negative(scalar_one) + tf.math.square( - tf.divide(tf.subtract(X, loc), scale) - ) - return const diff --git a/batchglm/train/tf1/glm_norm/model.py b/batchglm/train/tf1/glm_norm/model.py deleted file mode 100644 index 0ac6efc0..00000000 --- a/batchglm/train/tf1/glm_norm/model.py +++ /dev/null @@ -1,138 +0,0 @@ -import logging - -import tensorflow as tf - -import numpy as np - -from .external import ProcessModelGLM, ModelVarsGLM, BasicModelGraphGLM -from .external import pkg_constants - -logger = logging.getLogger(__name__) - - -class ProcessModel(ProcessModelGLM): - - def param_bounds( - self, - dtype - ): - if isinstance(dtype, tf.DType): - dmin = dtype.min - dmax = dtype.max - dtype = dtype.as_numpy_dtype - else: - dtype = np.dtype(dtype) - dmin = np.finfo(dtype).min - dmax = np.finfo(dtype).max - dtype = dtype.type - - sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) - bounds_min = { - "a_var": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, - "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, - "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, - "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, - "sd": np.nextafter(0, np.inf, dtype=dtype), - "probs": dtype(0), - "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), - } - bounds_max = { - "a_var": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, - "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, - "probs": dtype(1), - "log_probs": dtype(0), - } - return bounds_min, bounds_max - - -class ModelVars(ProcessModel, ModelVarsGLM): - """ - Full class. 
- """ - - -class BasicModelGraph(ProcessModel, BasicModelGraphGLM): - - def __init__( - self, - X, - design_loc, - design_scale, - constraints_loc, - constraints_scale, - a_var, - b_var, - dtype, - size_factors=None - ): - a_var = self.tf_clip_param(a_var, "a_var") - b_var = self.tf_clip_param(b_var, "b_var") - - if constraints_loc is not None: - eta_loc = tf.matmul(design_loc, tf.matmul(constraints_loc, a_var)) - else: - eta_loc = tf.matmul(design_loc, a_var) - - if size_factors is not None: - eta_loc = tf.multiply(eta_loc, size_factors) - - eta_loc = self.tf_clip_param(eta_loc, "eta_loc") - - if constraints_scale is not None: - eta_scale = tf.matmul(design_scale, tf.matmul(constraints_scale, b_var)) - else: - eta_scale = tf.matmul(design_scale, b_var) - - eta_scale = self.tf_clip_param(eta_scale, "eta_scale") - - # Inverse linker functions: - model_loc = eta_loc - model_scale = tf.math.exp(eta_scale) - - # Log-likelihood: - const = tf.constant(-0.5 * np.log(2 * np.pi), shape=(), dtype=dtype) - if isinstance(X, tf.SparseTensor): - log_probs = const - \ - eta_scale - \ - 0.5 * tf.math.square(tf.divide( - tf.sparse.add(X, - model_loc), - model_scale - )) - log_probs.set_shape([None, a_var.shape[1]]) # Need this so as shape is completely lost. - else: - log_probs = const - \ - eta_scale - \ - 0.5 * tf.math.square(tf.divide( - X - model_loc, - model_scale - )) - log_probs = self.tf_clip_param(log_probs, "log_probs") - - # Variance: - sigma2 = tf.square(model_scale) - - self.X = X - self.design_loc = design_loc - self.design_scale = design_scale - self.constraints_loc = constraints_loc - self.constraints_scale = constraints_scale - self.a_var = a_var - self.b_var = b_var - self.size_factors = size_factors - self.dtype = dtype - - self.eta_loc = eta_loc - self.eta_scale = eta_scale - self.model_loc = model_loc - self.model_scale = model_scale - self.mean = model_loc - self.sd = model_scale - - self.log_probs = log_probs - - self.sigma2 = sigma2 diff --git a/batchglm/train/tf1/glm_norm/reducible_tensors.py b/batchglm/train/tf1/glm_norm/reducible_tensors.py deleted file mode 100644 index 862ccaf8..00000000 --- a/batchglm/train/tf1/glm_norm/reducible_tensors.py +++ /dev/null @@ -1,13 +0,0 @@ -import logging - -from .external import ReducableTensorsGLMALL -from .hessians import Hessians -from .jacobians import Jacobians -from .fim import FIM - -logger = logging.getLogger("batchglm") - - -class ReducibleTensors(Jacobians, Hessians, FIM, ReducableTensorsGLMALL): - """ - """ diff --git a/batchglm/train/tf1/glm_norm/training_strategies.py b/batchglm/train/tf1/glm_norm/training_strategies.py deleted file mode 100644 index 2ba524a7..00000000 --- a/batchglm/train/tf1/glm_norm/training_strategies.py +++ /dev/null @@ -1,27 +0,0 @@ -from enum import Enum - - -class TrainingStrategies(Enum): - - AUTO = None - DEFAULT = [ - { - "convergence_criteria": "all_converged", - "use_batching": False, - "optim_algo": "irls_tr", - }, - ] - IRLS = [ - { - "convergence_criteria": "all_converged", - "use_batching": False, - "optim_algo": "irls_tr", - }, - ] - IRLS_BATCHED = [ - { - "convergence_criteria": "all_converged", - "use_batching": True, - "optim_algo": "irls_tr", - }, - ] diff --git a/batchglm/train/tf1/ops.py b/batchglm/train/tf1/ops.py deleted file mode 100644 index 8c6ea45f..00000000 --- a/batchglm/train/tf1/ops.py +++ /dev/null @@ -1,59 +0,0 @@ -import tensorflow as tf -from typing import Union - - -def swap_dims(tensor, axis0, axis1, exec_transpose=True, return_perm=False, name="swap_dims"): - """ - Swaps two 
dimensions in a given tensor. - - :param tensor: The tensor whose axes should be swapped - :param axis0: The first axis which should be swapped with `axis1` - :param axis1: The second axis which should be swapped with `axis0` - :param exec_transpose: Should the transpose operation be applied? - :param return_perm: Should the permutation argument for `tf1.transpose` be returned? - Autmoatically true, if `exec_transpose` is False - :param name: The name scope of this op - :return: either retval, (retval, permutation) or permutation - """ - with tf.name_scope(name): - rank = tf.range(tf.rank(tensor)) - idx0 = rank[axis0] - idx1 = rank[axis1] - perm0 = tf.where(tf.equal(rank, idx0), tf.tile(tf.expand_dims(idx1, 0), [tf.size(rank)]), rank) - perm1 = tf.where(tf.equal(rank, idx1), tf.tile(tf.expand_dims(idx0, 0), [tf.size(rank)]), perm0) - - if exec_transpose: - retval = tf.transpose(tensor, perm1) - - if return_perm: - return retval, perm1 - else: - return retval - else: - return perm1 - - -def stacked_lstsq(L, b, rcond=1e-10, name="stacked_lstsq"): - r""" - Solve `Lx = b`, via SVD least squares cutting of small singular values - - :param L: tensor of shape (..., M, K) - :param b: tensor of shape (..., M, N). - :param rcond: threshold for inverse - :param name: name scope of this op - :return: x of shape (..., K, N) - """ - with tf.name_scope(name): - u, s, v = tf.linalg.svd(L, full_matrices=False) - s_max = s.max(axis=-1, keepdims=True) - s_min = rcond * s_max - - inv_s = tf.where(s >= s_min, tf.reciprocal(s), 0) - - x = tf.einsum( - '...MK,...MN->...KN', - v, - tf.einsum('...K,...MK,...MN->...KN', inv_s, u, b) - ) - - return tf.conj(x) diff --git a/batchglm/train/tf1/train.py b/batchglm/train/tf1/train.py deleted file mode 100644 index 151343f0..00000000 --- a/batchglm/train/tf1/train.py +++ /dev/null @@ -1,315 +0,0 @@ -import contextlib -import logging -import tensorflow as tf -from typing import Union, Dict - -logger = logging.getLogger(__name__) - - -class MultiTrainer: - - def __init__( - self, - learning_rate, - loss=None, - variables: tf.Variable = None, - gradients: tf.Tensor = None, - apply_gradients: Union[callable, Dict[tf.Variable, callable]] = None, - newton_delta: tf.Tensor = None, - irls_delta: tf.Tensor = None, - irls_gd_delta: tf.Tensor = None, - train_ops_nr_tr=None, - train_ops_irls_tr=None, - train_ops_irls_gd_tr=None, - global_step=None, - apply_train_ops: callable = None, - provide_optimizers: Union[dict, None] = None, - session = None, - name=None - ): - r""" - - :param learning_rate: learning rate used for training - :param loss: loss which should be minimized - :param variables: list of variables which will be trained - :param gradients: tensor of gradients of loss function with respect to trained parameters. - If gradients is not given, gradients are computed via tensorflow based on the given loss. - :param apply_gradients: callable(s) appliable to the gradients. - Can be either a single callable which will be applied to all gradients or a dict of - {tf1.Variable: callable} mappings. - :param newton_delta: tensor Precomputed custom newton-rhapson parameter update to apply. - :param irls_delta: tensor Precomputed custom IRLS parameter update to apply. 
- :param global_step: global step counter - :param apply_train_ops: callable which will be applied to all train ops - :param name: optional name scope - """ - self.session = session - with contextlib.ExitStack() as stack: - if name is not None: - gs = stack.enter_context(tf.name_scope(name)) - - if gradients is None: - if variables is None: - raise ValueError("Either variables and loss or gradients have to be specified") - - logger.debug(" **** Compute gradients using tensorflow") - plain_gradients = tf.gradients(loss, variables) - plain_gradients_vars = [(g, v) for g, v in zip(plain_gradients, variables)] - else: - plain_gradients_vars = [(gradients, variables)] - - if callable(apply_gradients): - gradients_vars = [(apply_gradients(g), v) for g, v in plain_gradients_vars] - elif isinstance(apply_gradients, dict): - gradients_vars = [(apply_gradients[v](g) if v in apply_gradients else g, v) for g, v in plain_gradients_vars] - else: - gradients_vars = plain_gradients_vars - - # Standard tensorflow optimizers. - if provide_optimizers["gd"]: - logger.debug(" *** Building optimizer: GD") - optim_GD = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=learning_rate) - train_op_GD = optim_GD.apply_gradients(gradients_vars, global_step=global_step) - if apply_train_ops is not None: - train_op_GD = apply_train_ops(train_op_GD) - update_op_GD = tf.multiply(gradients, learning_rate) - else: - optim_GD = None - train_op_GD = None - update_op_GD = None - - if provide_optimizers["adam"]: - logger.debug(" *** Building optimizer: ADAM") - optim_Adam = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) - train_op_Adam = optim_Adam.apply_gradients(gradients_vars, global_step=global_step) - if apply_train_ops is not None: - train_op_Adam = apply_train_ops(train_op_Adam) - update_op_Adam = tf.multiply(gradients, learning_rate) # TODO replace by actual step - else: - optim_Adam = None - train_op_Adam = None - update_op_Adam = None - - if provide_optimizers["adagrad"]: - logger.debug(" *** Building optimizer: ADAGRAD") - optim_Adagrad = tf.compat.v1.train.AdagradOptimizer(learning_rate=learning_rate) - train_op_Adagrad = optim_Adagrad.apply_gradients(gradients_vars, global_step=global_step) - if apply_train_ops is not None: - train_op_Adagrad = apply_train_ops(train_op_Adagrad) - update_op_Adagrad = tf.multiply(gradients, learning_rate) # TODO replace by actual step - else: - optim_Adagrad = None - train_op_Adagrad = None - update_op_Adagrad = None - - if provide_optimizers["rmsprop"]: - logger.debug(" *** Building optimizer: RMSPROP") - optim_RMSProp = tf.compat.v1.train.RMSPropOptimizer(learning_rate=learning_rate) - train_op_RMSProp = optim_RMSProp.apply_gradients(gradients_vars, global_step=global_step) - if apply_train_ops is not None: - train_op_RMSProp = apply_train_ops(train_op_RMSProp) - update_op_RMSProp = tf.multiply(gradients, learning_rate) # TODO replace by actual step - else: - optim_RMSProp = None - train_op_RMSProp = None - update_op_RMSProp = None - - # Custom optimizers. 
- if provide_optimizers["nr"] and newton_delta is not None: - logger.debug(" *** Building optimizer: NR") - update_op_nr = newton_delta - - theta_new_nr = variables - newton_delta - train_op_nr = tf.group( - tf.compat.v1.assign(variables, theta_new_nr), - tf.compat.v1.assign_add(global_step, 1) - ) - if apply_train_ops is not None: - train_op_nr = apply_train_ops(train_op_nr) - else: - train_op_nr = None - update_op_nr = None - - if provide_optimizers["irls"] and irls_delta is not None: - logger.debug(" *** Building optimizer: IRLS") - update_op_irls = irls_delta - - theta_new_irls = variables - irls_delta - train_op_irls = tf.group( - tf.compat.v1.assign(variables, theta_new_irls), - tf.compat.v1.assign_add(global_step, 1) - ) - if apply_train_ops is not None: - train_op_irls = apply_train_ops(train_op_irls) - else: - train_op_irls = None - update_op_irls = None - - if provide_optimizers["irls_gd"] and irls_gd_delta is not None: - logger.debug(" *** Building optimizer: IRLS_GD") - update_op_irls_gd = irls_gd_delta - - theta_new_irls_gd = variables - irls_gd_delta - train_op_irls_gd = tf.group( - tf.compat.v1.assign(variables, theta_new_irls_gd), - tf.compat.v1.assign_add(global_step, 1) - ) - if apply_train_ops is not None: - train_op_irls_gd = apply_train_ops(train_op_irls_gd) - else: - train_op_irls_gd = None - update_op_irls_gd = None - - if provide_optimizers["nr_tr"] and train_ops_nr_tr is not None: - logger.debug(" *** Building optimizer: NR_TR") - train_op_nr_tr = {"trial_op": train_ops_nr_tr["trial_op"], - "update_op": tf.group(train_ops_nr_tr["update_op"], - tf.compat.v1.assign_add(global_step, 1))} - update_op_nr_tr = train_ops_nr_tr["update"] - else: - train_op_nr_tr = None - update_op_nr_tr = None - - if provide_optimizers["irls_tr"] and train_ops_irls_tr is not None: - logger.debug(" *** Building optimizer: IRLS_TR") - train_op_irls_tr = {"trial_op": train_ops_irls_tr["trial_op"], - "update_op": tf.group(train_ops_irls_tr["update_op"], - tf.compat.v1.assign_add(global_step, 1))} - update_op_irls_tr = train_ops_irls_tr["update"] - else: - train_op_irls_tr = None - update_op_irls_tr = None - - if provide_optimizers["irls_gd_tr"] and train_ops_irls_gd_tr is not None: - logger.debug(" *** Building optimizer: IRLS_GD_TR") - train_op_irls_gd_tr = {"trial_op": train_ops_irls_gd_tr["trial_op"], - "update_op": tf.group(train_ops_irls_gd_tr["update_op"], - tf.compat.v1.assign_add(global_step, 1))} - update_op_irls_gd_tr = train_ops_irls_gd_tr["update"] - else: - train_op_irls_gd_tr = None - update_op_irls_gd_tr = None - - self.global_step = global_step - self.plain_gradients = plain_gradients_vars - self.gradients = gradients_vars - - self.optim_GD = optim_GD - self.optim_Adam = optim_Adam - self.optim_Adagrad = optim_Adagrad - self.optim_RMSProp = optim_RMSProp - - self.train_op_GD = train_op_GD - self.train_op_Adam = train_op_Adam - self.train_op_Adagrad = train_op_Adagrad - self.train_op_RMSProp = train_op_RMSProp - self.train_op_nr = train_op_nr - self.train_op_nr_tr = train_op_nr_tr - self.train_op_irls = train_op_irls - self.train_op_irls_gd = train_op_irls_gd - self.train_op_irls_tr = train_op_irls_tr - self.train_op_irls_gd_tr = train_op_irls_gd_tr - - self.update_op_GD = update_op_GD - self.update_op_Adam = update_op_Adam - self.update_op_Adagrad = update_op_Adagrad - self.update_op_RMSProp = update_op_RMSProp - self.update_op_nr = update_op_nr - self.update_op_nr_tr = update_op_nr_tr - self.update_op_irls = update_op_irls - self.update_op_irls_gd = update_op_irls_gd - 
self.update_op_irls_tr = update_op_irls_tr - self.update_op_irls_gd_tr = update_op_irls_gd_tr - - #self.train_op_bfgs = train_op_bfgs - - - def train_op_by_name(self, name: str): - """ - Returns the train op specified by the provided name - - :param name: name of the requested train op. Can be: - - - "Adam" - - "Adagrad" - - "RMSprop" - - "GradientDescent" or "GD" - :return: train op - """ - name_lower = name.lower() - if name_lower == "gradient_descent" or name_lower == "gd": - if self.train_op_GD is None: - raise ValueError("Gradient decent not provided in initialization.") - return {"train": self.train_op_GD, "update": self.update_op_GD} - elif name_lower == "adam": - if self.train_op_Adam is None: - raise ValueError("Adam not provided in initialization.") - return {"train": self.train_op_Adam, "update": self.update_op_Adam} - elif name_lower == "adagrad": - if self.train_op_Adagrad is None: - raise ValueError("Adagrad decent not provided in initialization.") - return {"train": self.train_op_Adagrad, "update": self.update_op_Adagrad} - elif name_lower == "rmsprop": - if self.train_op_RMSProp is None: - raise ValueError("RMSProp decent not provided in initialization.") - return {"train": self.train_op_RMSProp, "update": self.update_op_RMSProp} - elif name_lower == "bfgs": - if self.train_op_bfgs is None: - raise ValueError("BFGS not provided in initialization.") - return {"train": self.train_op_bfgs, "update": self.update_op_bfgs} - elif name_lower.lower() == "newton" or \ - name_lower.lower() == "newton_raphson" or \ - name_lower.lower() == "nr": - if self.train_op_nr is None: - raise ValueError("Newton-rhapson not provided in initialization.") - return {"train": self.train_op_nr, "update": self.update_op_nr} - elif name_lower.lower() == "newton_tr" or \ - name_lower.lower() == "newton_raphson_tr" or \ - name_lower.lower() == "nr_tr": - if self.train_op_nr_tr is None: - raise ValueError("Newton-rhapson trust-region not provided in initialization.") - return {"train": self.train_op_nr_tr, "update": self.update_op_nr_tr} - elif name_lower.lower() == "irls" or \ - name_lower.lower() == "iwls": - if self.train_op_irls is None: - raise ValueError("IRLS not provided in initialization.") - return {"train": self.train_op_irls, "update": self.update_op_irls} - elif name_lower.lower() == "irls_gd" or \ - name_lower.lower() == "iwls_gd": - if self.train_op_irls_gd is None: - raise ValueError("IRLS_GD not provided in initialization.") - return {"train": self.train_op_irls_gd, "update": self.update_op_irls_gd} - elif name_lower.lower() == "irls_tr" or \ - name_lower.lower() == "iwls_tr": - if self.train_op_irls_tr is None: - raise ValueError("IRLS trust-region not provided in initialization.") - return {"train": self.train_op_irls_tr, "update": self.update_op_irls_tr} - elif name_lower.lower() == "irls_gd_tr" or \ - name_lower.lower() == "iwls_gd_tr": - if self.train_op_irls_gd_tr is None: - raise ValueError("IRLS_GD trust-region not provided in initialization.") - return {"train": self.train_op_irls_gd_tr, "update": self.update_op_irls_gd_tr} - else: - raise ValueError("Unknown optimizer %s" % name) - - def gradient_by_variable(self, variable: tf.Variable): - """ - Returns the gradient to a specific variable if existing in self.gradients - :param variable: the variable whose gradient is requested - :return: gradient tensor or None if not found - """ - for g, v in self.gradients: - if v is variable: - return g - return None - - def plain_gradient_by_variable(self, variable: tf.Variable): - """ - 
Returns the plain gradient to a specific variable if existing in self.plain_gradients - :param variable: the variable whose gradient is requested - :return: gradient tensor or None if not found - """ - for g, v in self.plain_gradients: - if v is variable: - return g - return None diff --git a/batchglm/unit_test/test_acc_analytic_glm_all.py b/batchglm/unit_test/test_acc_analytic_glm_all.py deleted file mode 100644 index 57e31e0d..00000000 --- a/batchglm/unit_test/test_acc_analytic_glm_all.py +++ /dev/null @@ -1,373 +0,0 @@ -import logging -import unittest -import numpy as np -import scipy.sparse - -import batchglm.api as glm -from batchglm.models.base_glm import _EstimatorGLM, _SimulatorGLM - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class _TestAccuracyAnalyticGlmAllEstim(): - - estimator: _EstimatorGLM - sim: _SimulatorGLM - noise_model: str - - def __init__( - self, - simulator, - train_scale, - noise_model, - sparse, - init_a, - init_b - ): - self.sim = simulator - self.noise_model = noise_model - - if noise_model is None: - raise ValueError("noise_model is None") - else: - if noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Estimator, InputDataGLM - elif noise_model == "norm": - from batchglm.api.models import Estimator, InputDataGLM - elif noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Estimator, InputDataGLM - else: - raise ValueError("noise_model not recognized") - - batch_size = 500 - provide_optimizers = {"gd": True, "adam": True, "adagrad": True, "rmsprop": True, - "nr": False, "nr_tr": False, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(simulator.input_data.x), - design_loc=simulator.input_data.design_loc, - design_scale=simulator.input_data.design_scale - ) - else: - input_data = InputDataGLM( - data=simulator.input_data.x, - design_loc=simulator.input_data.design_loc, - design_scale=simulator.input_data.design_scale - ) - - self.estimator = Estimator( - input_data=input_data, - batch_size=batch_size, - quick_scale=not train_scale, - provide_optimizers=provide_optimizers, - provide_batched=True, - provide_fim=False, - provide_hessian=False, - init_a=init_a, - init_b=init_b - ) - - def eval_estimation_a( - self, - init_a, - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - threshold_dev = 1e-2 - threshold_std = 1e-1 - elif self.noise_model == "norm": - threshold_dev = 1e-2 - threshold_std = 1e-1 - elif self.noise_model == "beta": - threshold_dev = 1e-2 - threshold_std = 1e-1 - else: - raise ValueError("noise_model not recognized") - - if init_a == "standard": - mean_dev = np.mean(self.estimator.model.a_var[0, :] - self.sim.a_var[0, :]) - std_dev = np.std(self.estimator.model.a_var[0, :] - self.sim.a_var[0, :]) - elif init_a == "closed_form": - mean_dev = np.mean(self.estimator.model.a_var - self.sim.a_var) - std_dev = np.std(self.estimator.model.a_var - self.sim.a_var) - else: - assert False - - logging.getLogger("batchglm").info("mean_dev_a %f" % mean_dev) - logging.getLogger("batchglm").info("std_dev_a %f" % std_dev) - - if np.abs(mean_dev) < threshold_dev and \ - std_dev < threshold_std: - return True - else: - return False - - def eval_estimation_b( - self, - init_b - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - threshold_dev = 1e-2 - 
threshold_std = 1e-1 - elif self.noise_model == "norm": - threshold_dev = 1e-2 - threshold_std = 1e-1 - elif self.noise_model == "beta": - threshold_dev = 1e-2 - threshold_std = 1e-1 - else: - raise ValueError("noise_model not recognized") - - if init_b == "standard": - mean_dev = np.mean(self.estimator.b_var[0, :] - self.sim.b[0, :]) - std_dev = np.std(self.estimator.b_var[0, :] - self.sim.b[0, :]) - elif init_b == "closed_form": - mean_dev = np.mean(self.estimator.b_var - self.sim.b) - std_dev = np.std(self.estimator.b_var - self.sim.b) - else: - assert False - - logging.getLogger("batchglm").info("mean_dev_b %f" % mean_dev) - logging.getLogger("batchglm").info("std_dev_b %f" % std_dev) - - if np.abs(mean_dev) < threshold_dev and \ - std_dev < threshold_std: - return True - else: - return False - - -class TestAccuracyAnalyticGlmAll( - unittest.TestCase -): - noise_model: str - - def get_simulator(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - return Simulator( - num_observations=100000, - num_features=3 - ) - - def get_estimator(self, train_scale, sparse, init_a, init_b): - return _TestAccuracyAnalyticGlmAllEstim( - simulator=self.sim, - train_scale=train_scale, - noise_model=self.noise_model, - sparse=sparse, - init_a=init_a, - init_b=init_b - ) - - def simulate_complex(self): - self.sim = self.get_simulator() - self.sim.generate_sample_description(num_batches=1, num_conditions=2) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.7, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0, 0.15, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_scale(shape): - theta = np.zeros(shape) - if self.noise_model in ["nb"]: - theta[0, :] = np.random.uniform(1, 3, shape[1]) - elif self.noise_model in ["norm"]: - theta[0, :] = np.random.uniform(1, 2, shape[1]) - elif self.noise_model in ["beta"]: - theta[0, :] = np.random.uniform(0.2, 0.4, shape[1]) - else: - raise ValueError("noise model not recognized") - return theta - - self.sim.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim.generate_data() - - def simulate_easy(self): - self.sim = self.get_simulator() - self.sim.generate_sample_description(num_batches=1, num_conditions=1) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.9, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - return np.ones(shape) - - def rand_fn_scale(shape): - theta = np.zeros(shape) - if self.noise_model in ["nb"]: - theta[0, :] = np.random.uniform(1, 3, shape[1]) - elif self.noise_model in ["norm"]: - theta[0, :] = 
np.random.uniform(1, 2, shape[1]) - elif self.noise_model in ["beta"]: - theta[0, :] = np.random.uniform(0.2, 0.4, shape[1]) - else: - raise ValueError("noise model not recognized") - return theta - - self.sim.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim.generate_data() - assert self.sim.input_data.design_loc.shape[1] == 1, "confounders include in intercept-only simulation" - assert self.sim.input_data.design_scale.shape[1] == 1, "confounders include in intercept-only simulation" - - def _test_a_and_b(self, sparse, init_a, init_b): - estimator = self.get_estimator( - train_scale=False, - sparse=sparse, - init_a=init_a, - init_b=init_b - ) - estimator.estimator.initialize() - estimator.estimator.finalize() - success = estimator.eval_estimation_a( - init_a=init_a, - ) - assert success, "estimation for a_model was inaccurate" - success = estimator.eval_estimation_b( - init_b=init_b - ) - assert success, "estimation for b_model was inaccurate" - return True - - -class TestAccuracyAnalyticGlmNb( - TestAccuracyAnalyticGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial data. - """ - - def test_a_closed_b_closed(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmNb.test_a_closed_b_closed()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate_complex() - self._test_a_and_b(sparse=False, init_a="closed_form", init_b="closed_form") - self._test_a_and_b(sparse=True, init_a="closed_form", init_b="closed_form") - - def test_a_standard_b_standard(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmNb.test_a_standard_b_standard()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate_easy() - self._test_a_and_b(sparse=False, init_a="standard", init_b="standard") - self._test_a_and_b(sparse=True, init_a="standard", init_b="standard") - - -class TestAccuracyAnalyticGlmNorm( - TestAccuracyAnalyticGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for normally distributed data. - """ - - def test_a_closed_b_closed(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmNorm.test_a_closed_b_closed()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate_complex() - self._test_a_and_b(sparse=False, init_a="closed_form", init_b="closed_form") - self._test_a_and_b(sparse=True, init_a="closed_form", init_b="closed_form") - - def test_a_standard_b_standard(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmNorm.test_a_standard_b_standard()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate_easy() - self._test_a_and_b(sparse=False, init_a="standard", init_b="standard") - self._test_a_and_b(sparse=True, init_a="standard", init_b="standard") - - -class TestAccuracyAnalyticGlmBeta( - TestAccuracyAnalyticGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for beta distributed data. 
- """ - - def test_a_closed_b_closed(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmBeta.test_a_closed_b_closed()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate_complex() - self._test_a_and_b(sparse=False, init_a="closed_form", init_b="closed_form") - self._test_a_and_b(sparse=True, init_a="closed_form", init_b="closed_form") - - def test_a_standard_b_standard(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyAnalyticGlmBeta.test_a_standard_b_standard()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate_easy() - self._test_a_and_b(sparse=False, init_a="standard", init_b="standard") - self._test_a_and_b(sparse=True, init_a="standard", init_b="standard") - - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_acc_constrained_vglm_all.py b/batchglm/unit_test/test_acc_constrained_vglm_all.py deleted file mode 100644 index 1723a325..00000000 --- a/batchglm/unit_test/test_acc_constrained_vglm_all.py +++ /dev/null @@ -1,140 +0,0 @@ -import logging -import numpy as np -import unittest - -import batchglm.api as glm -from batchglm.unit_test.test_acc_glm_all import _TestAccuracyGlmAll - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class _TestAccuracyVglmAll(_TestAccuracyGlmAll): - - def simulate(self): - super().simulate() - # Override design matrix of simulation 1 to encode constraints - dmat = np.hstack([ - self.sim1.input_data.design_loc, - np.expand_dims(self.sim1.input_data.design_loc[:, 0] - - self.sim1.input_data.design_loc[:, -1], axis=-1) - ]) - constraints = np.zeros([4, 3]) - constraints[0, 0] = 1 - constraints[1, 1] = 1 - constraints[2, 2] = 1 - constraints[3, 2] = -1 - new_coef_names = ['Intercept', 'condition[T.1]', 'batch[1]', 'batch[2]'] - self.sim1.input_data.design_loc = dmat - self.sim1.input_data.design_scale = dmat - self.sim1.input_data._design_loc_names = new_coef_names - self.sim1.input_data._design_scale_names = new_coef_names - self.sim1.input_data.constraints_loc = constraints - self.sim1.input_data.constraints_scale = constraints - - def _test_full(self, sparse): - self._test_full_a_and_b(sparse=sparse) - self._test_full_a_only(sparse=sparse) - - def _test_batched(self, sparse): - self._test_batched_a_and_b(sparse=sparse) - self._test_batched_a_only(sparse=sparse) - - -class TestAccuracyVglmNb( - _TestAccuracyVglmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial distributed data. - """ - - def test_full_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmNb.test_full_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmNb.test_batched_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyVglmNorm( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for normal distributed data. 
- # TODO not tested yet. - """ - - def test_full_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmNorm.test_full_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmNorm.test_batched_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyVglmBeta( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for beta distributed data. - TODO not working yet. - """ - - def test_full_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmBeta.test_full_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyVglmBeta.test_batched_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_acc_glm_all.py b/batchglm/unit_test/test_acc_glm_all.py deleted file mode 100644 index 0dce6a43..00000000 --- a/batchglm/unit_test/test_acc_glm_all.py +++ /dev/null @@ -1,528 +0,0 @@ -import logging -import numpy as np -import scipy.sparse -import unittest - -import batchglm.api as glm - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class _TestAccuracyGlmAllEstim: - - def __init__( - self, - simulator, - quick_scale, - noise_model, - sparse, - init_mode - ): - if noise_model is None: - raise ValueError("noise_model is None") - else: - if noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Estimator, InputDataGLM - elif noise_model == "norm": - from batchglm.api.models import Estimator, InputDataGLM - elif noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Estimator, InputDataGLM - else: - raise ValueError("noise_model not recognized") - - batch_size = 2000 - provide_optimizers = { - "gd": True, - "adam": True, - "adagrad": True, - "rmsprop": True, - "nr": True, - "nr_tr": True, - "irls": noise_model in ["nb", "norm"], - "irls_gd": noise_model in ["nb", "norm"], - "irls_tr": noise_model in ["nb", "norm"], - "irls_gd_tr": noise_model in ["nb", "norm"] - } - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(simulator.input_data.x), - design_loc=simulator.input_data.design_loc, - design_scale=simulator.input_data.design_scale, - design_loc_names=simulator.input_data.design_loc_names, - design_scale_names=simulator.input_data.design_scale_names, - constraints_loc=simulator.input_data.constraints_loc, - constraints_scale=simulator.input_data.constraints_scale, - size_factors=simulator.input_data.size_factors, - as_dask=False - ) - else: - input_data = InputDataGLM( - data=simulator.input_data.x, - design_loc=simulator.input_data.design_loc, - 
design_scale=simulator.input_data.design_scale, - design_loc_names=simulator.input_data.design_loc_names, - design_scale_names=simulator.input_data.design_scale_names, - constraints_loc=simulator.input_data.constraints_loc, - constraints_scale=simulator.input_data.constraints_scale, - size_factors=simulator.input_data.size_factors, - as_dask=False - ) - - self.estimator = Estimator( - input_data=input_data, - batch_size=batch_size, - quick_scale=quick_scale, - provide_optimizers=provide_optimizers, - provide_batched=True, - provide_fim=noise_model in ["nb", "norm"], - provide_hessian=True, - init_a=init_mode, - init_b=init_mode - ) - self.sim = simulator - - def estimate( - self, - algo, - batched, - acc, - lr - ): - self.estimator.initialize() - self.estimator.train_sequence(training_strategy=[ - { - "learning_rate": lr, - "convergence_criteria": "all_converged", - "stopping_criteria": acc, - "use_batching": batched, - "optim_algo": algo, - }, - ]) - - def eval_estimation( - self, - batched, - train_loc, - train_scale - ): - if batched: - threshold_dev_a = 0.4 - threshold_dev_b = 0.4 - threshold_std_a = 2 - threshold_std_b = 2 - else: - threshold_dev_a = 0.2 - threshold_dev_b = 0.2 - threshold_std_a = 1 - threshold_std_b = 1 - - success = True - if train_loc: - mean_rel_dev_a = np.mean((self.estimator.model.a_var - self.sim.a_var) / self.sim.a_var) - std_rel_dev_a = np.std((self.estimator.model.a_var - self.sim.a_var) / self.sim.a_var) - - logging.getLogger("batchglm").info("mean_rel_dev_a %f" % mean_rel_dev_a) - logging.getLogger("batchglm").info("std_rel_dev_a %f" % std_rel_dev_a) - - if np.abs(mean_rel_dev_a) > threshold_dev_a or std_rel_dev_a > threshold_std_a: - success = False - if train_scale: - mean_rel_dev_b = np.mean((self.estimator.model.b_var - self.sim.b_var) / self.sim.b_var) - std_rel_dev_b = np.std((self.estimator.model.b_var - self.sim.b_var) / self.sim.b_var) - - logging.getLogger("batchglm").info("mean_rel_dev_b %f" % mean_rel_dev_b) - logging.getLogger("batchglm").info("std_rel_dev_b %f" % std_rel_dev_b) - - if np.abs(mean_rel_dev_b) > threshold_dev_b or std_rel_dev_b > threshold_std_b: - success = False - - return success - - -class _TestAccuracyGlmAll( - unittest.TestCase -): - """ - Test whether optimizers yield exact results. - - Accuracy is evaluted via deviation of simulated ground truth. - The unit tests test individual training graphs and multiple optimizers - (incl. one tensorflow internal optimizer and newton-rhapson) - for each training graph. The training graphs tested are as follows: - - - full data model - - train a and b model: test_full_global_a_and_b() - - train a model only: test_full_global_a_only() - - train b model only: test_full_global_b_only() - - batched data model - - train a and b model: test_batched_global_a_and_b() - - train a model only: test_batched_global_a_only() - - train b model only: test_batched_global_b_only() - - The unit tests throw an assertion error if the required accurcy is - not met. Accuracy thresholds are fairly lenient so that unit_tests - pass even with noise inherent in fast optimisation and random - initialisation in simulation. Still, large biases (i.e. graph errors) - should be discovered here. - - Note on settings by optimised: - - IRLS_TR: Needs slow TR collapse to converge. 
- """ - noise_model: str - optims_tested: dict - - def simulate(self): - self.simulate1() - self.simulate2() - - def get_simulator(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - return Simulator(num_observations=10000, num_features=10) - - def simulate1(self): - self.sim1 = self.get_simulator() - self.sim1.generate_sample_description(num_batches=2, num_conditions=2) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.7, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0, 0.15, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_scale(shape): - if self.noise_model in ["nb"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["norm"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0, 0.15, shape) - else: - raise ValueError("noise model not recognized") - return theta - - self.sim1.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim1.generate_data() - - def simulate2(self): - self.sim2 = self.get_simulator() - self.sim2.generate_sample_description(num_batches=0, num_conditions=2) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.9, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.ones(shape) - elif self.noise_model in ["beta"]: - theta = np.zeros(shape)+0.05 - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_scale(shape): - if self.noise_model in ["nb"]: - theta = np.ones(shape) - elif self.noise_model in ["norm"]: - theta = np.ones(shape) - elif self.noise_model in ["beta"]: - theta = np.ones(shape) - 0.8 - else: - raise ValueError("noise model not recognized") - return theta - - self.sim2.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim2.generate_data() - - def simulator(self, train_loc): - if train_loc: - return self.sim1 - else: - return self.sim2 - - def basic_test( - self, - batched, - train_loc, - train_scale, - sparse - ): - self.optims_tested = { - "nb": ["ADAM", "IRLS_GD_TR"], - "beta": ["NR_TR"], - "norm": ["IRLS_TR"] - } - if self.noise_model in ["norm"]: - algos = self.optims_tested["norm"] - init_mode = "all_zero" - lr = {"ADAM": 1e-3, "NR_TR": 1, "IRLS_TR": 1} - elif self.noise_model in ["beta"]: - algos = self.optims_tested["beta"] - init_mode = "all_zero" - if batched: - lr = {"ADAM": 0.1, "NR_TR": 1} - else: - lr = {"ADAM": 1e-5, "NR_TR": 1} 
- elif self.noise_model in ["nb"]: - algos = self.optims_tested["nb"] - init_mode = "standard" - if batched: - lr = {"ADAM": 0.1, "IRLS_GD_TR": 1} - else: - lr = {"ADAM": 0.05, "IRLS_GD_TR": 1} - else: - raise ValueError("noise model %s not recognized" % self.noise_model) - - for algo in algos: - logger.info("algorithm: %s" % algo) - if algo in ["ADAM", "RMSPROP", "GD"]: - if batched: - acc = 1e-4 - else: - acc = 1e-6 - glm.pkg_constants.JACOBIAN_MODE = "analytic" - elif algo in ["NR", "NR_TR"]: - if batched: - acc = 1e-12 - else: - acc = 1e-14 - if self.noise_model in ["beta"]: - glm.pkg_constants.TRUST_REGION_RADIUS_INIT = 1 - else: - glm.pkg_constants.TRUST_REGION_RADIUS_INIT = 100 - glm.pkg_constants.TRUST_REGION_T1 = 0.5 - glm.pkg_constants.TRUST_REGION_T2 = 1.5 - glm.pkg_constants.CHOLESKY_LSTSQS = True - glm.pkg_constants.CHOLESKY_LSTSQS_BATCHED = True - glm.pkg_constants.JACOBIAN_MODE = "analytic" - glm.pkg_constants.HESSIAN_MODE = "analytic" - elif algo in ["IRLS", "IRLS_TR", "IRLS_GD", "IRLS_GD_TR"]: - if batched: - acc = 1e-12 - else: - acc = 1e-14 - glm.pkg_constants.TRUST_REGION_T1 = 0.5 - glm.pkg_constants.TRUST_REGION_T2 = 1.5 - glm.pkg_constants.CHOLESKY_LSTSQS = True - glm.pkg_constants.CHOLESKY_LSTSQS_BATCHED = True - glm.pkg_constants.JACOBIAN_MODE = "analytic" - else: - return ValueError("algo %s not recognized" % algo) - estimator = _TestAccuracyGlmAllEstim( - simulator=self.simulator(train_loc=train_loc), - quick_scale=False if train_scale else True, - noise_model=self.noise_model, - sparse=sparse, - init_mode=init_mode - ) - estimator.estimate( - algo=algo, - batched=batched, - acc=acc, - lr=lr[algo] - ) - estimator.estimator.finalize() - success = estimator.eval_estimation( - batched=batched, - train_loc=train_loc, - train_scale=train_scale, - ) - assert success, "%s did not yield exact results" % algo - - return True - - def _test_full_a_and_b(self, sparse): - return self.basic_test( - batched=False, - train_loc=True, - train_scale=True, - sparse=sparse - ) - - def _test_full_a_only(self, sparse): - return self.basic_test( - batched=False, - train_loc=True, - train_scale=False, - sparse=sparse - ) - - def _test_full_b_only(self, sparse): - return self.basic_test( - batched=False, - train_loc=False, - train_scale=True, - sparse=sparse - ) - - def _test_batched_a_and_b(self, sparse): - return self.basic_test( - batched=True, - train_loc=True, - train_scale=True, - sparse=sparse - ) - - def _test_batched_a_only(self, sparse): - return self.basic_test( - batched=True, - train_loc=True, - train_scale=False, - sparse=sparse - ) - - def _test_batched_b_only(self, sparse): - return self.basic_test( - batched=True, - train_loc=False, - train_scale=True, - sparse=sparse - ) - - def _test_full(self, sparse): - self._test_full_a_and_b(sparse=sparse) - self._test_full_a_only(sparse=sparse) - self._test_full_b_only(sparse=sparse) - - def _test_batched(self, sparse): - self._test_batched_a_and_b(sparse=sparse) - self._test_batched_a_only(sparse=sparse) - self._test_batched_b_only(sparse=sparse) - - -class TestAccuracyGlmNb( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial distributed data. 
- """ - - def test_full_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNb.test_full_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNb.test_batched_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyGlmNorm( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for normal distributed data. - """ - - def test_full_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNorm.test_full_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNorm.test_batched_norm()") - # TODO not working yet. - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyGlmBeta( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for beta distributed data. - TODO not working yet. - """ - - def test_full_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmBeta.test_full_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmBeta.test_batched_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_acc_glm_all_tf2.py b/batchglm/unit_test/test_acc_glm_all_tf2.py deleted file mode 100644 index f4ab16fb..00000000 --- a/batchglm/unit_test/test_acc_glm_all_tf2.py +++ /dev/null @@ -1,524 +0,0 @@ -import logging -import numpy as np -import scipy.sparse -import unittest - -import batchglm.api as glm - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class _TestAccuracyGlmAllEstim: - - def __init__( - self, - simulator, - quick_scale, - noise_model, - sparse, - init_mode - ): - if noise_model is None: - raise ValueError("noise_model is None") - else: - if noise_model == "nb": - from batchglm.api.models.glm_nb import Estimator, InputDataGLM - elif noise_model == "norm": - from batchglm.api.models.glm_norm import Estimator, InputDataGLM - elif noise_model == "beta": - from batchglm.api.models.glm_beta import Estimator, InputDataGLM - else: - raise ValueError("noise_model not recognized") - - batch_size = 2000 - provide_optimizers = { - "gd": True, - "adam": True, - "adagrad": True, - "rmsprop": True, - "nr": True, - "nr_tr": True, 
- "irls": noise_model in ["nb", "norm"], - "irls_gd": noise_model in ["nb", "norm"], - "irls_tr": noise_model in ["nb", "norm"], - "irls_gd_tr": noise_model in ["nb", "norm"] - } - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(simulator.input_data.x), - design_loc=simulator.input_data.design_loc, - design_scale=simulator.input_data.design_scale, - constraints_loc=simulator.input_data.constraints_loc, - constraints_scale=simulator.input_data.constraints_scale, - size_factors=simulator.input_data.size_factors - ) - else: - input_data = InputDataGLM( - data=simulator.input_data.x, - design_loc=simulator.input_data.design_loc, - design_scale=simulator.input_data.design_scale, - constraints_loc=simulator.input_data.constraints_loc, - constraints_scale=simulator.input_data.constraints_scale, - size_factors=simulator.input_data.size_factors - ) - - self.estimator = Estimator( - input_data=input_data, - #batch_size=batch_size, - quick_scale=quick_scale, - #provide_optimizers=provide_optimizers, - #provide_batched=True, - #provide_fim=noise_model in ["nb", "norm"], - #provide_hessian=True, - init_a=init_mode, - init_b=init_mode - ) - self.sim = simulator - - def estimate( - self, - algo, - batched, - acc, - lr - ): - self.estimator.initialize() - self.estimator.train_sequence(training_strategy=[ - { - "learning_rate": lr, - "convergence_criteria": "all_converged", - "stopping_criteria": acc, - "use_batching": batched, - "optim_algo": algo, - "featurewise": False - }, - ]) - - def eval_estimation( - self, - batched, - train_loc, - train_scale - ): - if batched: - threshold_dev_a = 0.4 - threshold_dev_b = 0.4 - threshold_std_a = 2 - threshold_std_b = 2 - else: - threshold_dev_a = 0.2 - threshold_dev_b = 0.2 - threshold_std_a = 1 - threshold_std_b = 1 - - success = True - if train_loc: - mean_rel_dev_a = np.mean((self.estimator.model.a_var - self.sim.a_var) / self.sim.a_var) - std_rel_dev_a = np.std((self.estimator.model.a_var - self.sim.a_var) / self.sim.a_var) - - logging.getLogger("batchglm").info("mean_rel_dev_a %f" % mean_rel_dev_a) - logging.getLogger("batchglm").info("std_rel_dev_a %f" % std_rel_dev_a) - - if np.abs(mean_rel_dev_a) > threshold_dev_a or std_rel_dev_a > threshold_std_a: - success = False - if train_scale: - mean_rel_dev_b = np.mean((self.estimator.model.b_var - self.sim.b_var) / self.sim.b_var) - std_rel_dev_b = np.std((self.estimator.model.b_var - self.sim.b_var) / self.sim.b_var) - - logging.getLogger("batchglm").info("mean_rel_dev_b %f" % mean_rel_dev_b) - logging.getLogger("batchglm").info("std_rel_dev_b %f" % std_rel_dev_b) - - if np.abs(mean_rel_dev_b) > threshold_dev_b or std_rel_dev_b > threshold_std_b: - success = False - - return success - - -class _TestAccuracyGlmAll( - unittest.TestCase -): - """ - Test whether optimizers yield exact results. - - Accuracy is evaluted via deviation of simulated ground truth. - The unit tests test individual training graphs and multiple optimizers - (incl. one tensorflow internal optimizer and newton-rhapson) - for each training graph. The training graphs tested are as follows: - - - full data model - - train a and b model: test_full_global_a_and_b() - - train a model only: test_full_global_a_only() - - train b model only: test_full_global_b_only() - - batched data model - - train a and b model: test_batched_global_a_and_b() - - train a model only: test_batched_global_a_only() - - train b model only: test_batched_global_b_only() - - The unit tests throw an assertion error if the required accurcy is - not met. 
Accuracy thresholds are fairly lenient so that unit_tests - pass even with noise inherent in fast optimisation and random - initialisation in simulation. Still, large biases (i.e. graph errors) - should be discovered here. - - Note on settings by optimised: - - IRLS_TR: Needs slow TR collapse to converge. - """ - noise_model: str - optims_tested: dict - - def simulate(self): - self.simulate1() - self.simulate2() - - def get_simulator(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models.glm_norm import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - return Simulator(num_observations=10000, num_features=10) - - def simulate1(self): - self.sim1 = self.get_simulator() - self.sim1.generate_sample_description(num_batches=2, num_conditions=2) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.7, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0, 0.15, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_scale(shape): - if self.noise_model in ["nb"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["norm"]: - theta = np.random.uniform(1, 3, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0, 0.15, shape) - else: - raise ValueError("noise model not recognized") - return theta - - self.sim1.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim1.generate_data() - - def simulate2(self): - self.sim2 = self.get_simulator() - self.sim2.generate_sample_description(num_batches=0, num_conditions=2) - - def rand_fn_ave(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.random.uniform(10, 1000, shape) - elif self.noise_model in ["beta"]: - theta = np.random.uniform(0.1, 0.9, shape) - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_loc(shape): - if self.noise_model in ["nb", "norm"]: - theta = np.ones(shape) - elif self.noise_model in ["beta"]: - theta = np.zeros(shape)+0.05 - else: - raise ValueError("noise model not recognized") - return theta - - def rand_fn_scale(shape): - if self.noise_model in ["nb"]: - theta = np.ones(shape) - elif self.noise_model in ["norm"]: - theta = np.ones(shape) - elif self.noise_model in ["beta"]: - theta = np.ones(shape) - 0.8 - else: - raise ValueError("noise model not recognized") - return theta - - self.sim2.generate_params( - rand_fn_ave=lambda shape: rand_fn_ave(shape), - rand_fn_loc=lambda shape: rand_fn_loc(shape), - rand_fn_scale=lambda shape: rand_fn_scale(shape) - ) - self.sim2.generate_data() - - def simulator(self, train_loc): - if train_loc: - return self.sim1 - else: - return self.sim2 - - def basic_test( - self, - batched, - train_loc, - train_scale, - sparse - ): - self.optims_tested = { - "nb": ["ADAM", "IRLS_GD_TR"], - "beta": ["NR_TR"], - "norm": ["IRLS_TR"] - } - if self.noise_model in 
["norm"]: - algos = self.optims_tested["norm"] - init_mode = "all_zero" - lr = {"ADAM": 1e-3, "NR_TR": 1, "IRLS_TR": 1} - elif self.noise_model in ["beta"]: - algos = self.optims_tested["beta"] - init_mode = "all_zero" - if batched: - lr = {"ADAM": 0.1, "NR_TR": 1} - else: - lr = {"ADAM": 1e-5, "NR_TR": 1} - elif self.noise_model in ["nb"]: - algos = self.optims_tested["nb"] - init_mode = "standard" - if batched: - lr = {"ADAM": 0.1, "IRLS_GD_TR": 1} - else: - lr = {"ADAM": 0.05, "IRLS_GD_TR": 1} - else: - raise ValueError("noise model %s not recognized" % self.noise_model) - - for algo in algos: - logger.info("algorithm: %s" % algo) - if algo in ["ADAM", "RMSPROP", "GD"]: - if batched: - acc = 1e-4 - else: - acc = 1e-6 - glm.pkg_constants.JACOBIAN_MODE = "analytic" - elif algo in ["NR", "NR_TR"]: - if batched: - acc = 1e-12 - else: - acc = 1e-14 - if self.noise_model in ["beta"]: - glm.pkg_constants.TRUST_REGION_RADIUS_INIT = 1 - else: - glm.pkg_constants.TRUST_REGION_RADIUS_INIT = 100 - glm.pkg_constants.TRUST_REGION_T1 = 0.5 - glm.pkg_constants.TRUST_REGION_T2 = 1.5 - glm.pkg_constants.CHOLESKY_LSTSQS = True - glm.pkg_constants.CHOLESKY_LSTSQS_BATCHED = True - glm.pkg_constants.JACOBIAN_MODE = "analytic" - glm.pkg_constants.HESSIAN_MODE = "analytic" - elif algo in ["IRLS", "IRLS_TR", "IRLS_GD", "IRLS_GD_TR"]: - if batched: - acc = 1e-12 - else: - acc = 1e-14 - glm.pkg_constants.TRUST_REGION_T1 = 0.5 - glm.pkg_constants.TRUST_REGION_T2 = 1.5 - glm.pkg_constants.CHOLESKY_LSTSQS = True - glm.pkg_constants.CHOLESKY_LSTSQS_BATCHED = True - glm.pkg_constants.JACOBIAN_MODE = "analytic" - else: - return ValueError("algo %s not recognized" % algo) - estimator = _TestAccuracyGlmAllEstim( - simulator=self.simulator(train_loc=train_loc), - quick_scale=False if train_scale else True, - noise_model=self.noise_model, - sparse=sparse, - init_mode=init_mode - ) - estimator.estimate( - algo=algo, - batched=batched, - acc=acc, - lr=lr[algo] - ) - estimator.estimator.finalize() - success = estimator.eval_estimation( - batched=batched, - train_loc=train_loc, - train_scale=train_scale, - ) - assert success, "%s did not yield exact results" % algo - - return True - - def _test_full_a_and_b(self, sparse): - return self.basic_test( - batched=False, - train_loc=True, - train_scale=True, - sparse=sparse - ) - - def _test_full_a_only(self, sparse): - return self.basic_test( - batched=False, - train_loc=True, - train_scale=False, - sparse=sparse - ) - - def _test_full_b_only(self, sparse): - return self.basic_test( - batched=False, - train_loc=False, - train_scale=True, - sparse=sparse - ) - - def _test_batched_a_and_b(self, sparse): - return self.basic_test( - batched=True, - train_loc=True, - train_scale=True, - sparse=sparse - ) - - def _test_batched_a_only(self, sparse): - return self.basic_test( - batched=True, - train_loc=True, - train_scale=False, - sparse=sparse - ) - - def _test_batched_b_only(self, sparse): - return self.basic_test( - batched=True, - train_loc=False, - train_scale=True, - sparse=sparse - ) - - def _test_full(self, sparse): - self._test_full_a_and_b(sparse=sparse) - self._test_full_a_only(sparse=sparse) - self._test_full_b_only(sparse=sparse) - - def _test_batched(self, sparse): - self._test_batched_a_and_b(sparse=sparse) - self._test_batched_a_only(sparse=sparse) - self._test_batched_b_only(sparse=sparse) - - -class TestAccuracyGlmNb( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial distributed data. 
- """ - - def test_full_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNb.test_full_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - -""" - def test_batched_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNb.test_batched_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) -""" -""" -class TestAccuracyGlmNorm( - _TestAccuracyGlmAll, - unittest.TestCase -): - - Test whether optimizers yield exact results for normal distributed data. - - - def test_full_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNorm.test_full_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNorm.test_batched_norm()") - # TODO not working yet. - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyGlmBeta( - _TestAccuracyGlmAll, - unittest.TestCase -): - - Test whether optimizers yield exact results for beta distributed data. - TODO not working yet. - - - def test_full_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmBeta.test_full_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmBeta.test_batched_beta()") - - np.random.seed(1) - self.noise_model = "beta" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) -""" - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_acc_sizefactors_glm_all.py b/batchglm/unit_test/test_acc_sizefactors_glm_all.py deleted file mode 100644 index e4bfb814..00000000 --- a/batchglm/unit_test/test_acc_sizefactors_glm_all.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -import numpy as np -import unittest - -import batchglm.api as glm -from batchglm.unit_test.test_acc_glm_all import _TestAccuracyGlmAll - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class _TestAccuracyGlmAllSf(_TestAccuracyGlmAll): - - def simulate(self): - super().simulate() - # Add size factors into input data: Do not centre at 1 so that they bias MAD if something is off. 
- self.sim1.input_data.size_factors = np.random.uniform(1.5, 2., size=self.sim1.input_data.num_observations) - - def _test_full(self, sparse): - self._test_full_a_and_b(sparse=sparse) - - def _test_batched(self, sparse): - self._test_batched_a_and_b(sparse=sparse) - - -class TestAccuracyGlmNbSf( - _TestAccuracyGlmAllSf, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial distributed data. - """ - - def test_full_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNbSf.test_full_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNbSf.test_batched_nb()") - - np.random.seed(1) - self.noise_model = "nb" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyGlmNormSf( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for normal distributed data. - # TODO not tested yet. - """ - - def test_full_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNormSf.test_full_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_full(sparse=False) - self._test_full(sparse=True) - - def test_batched_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestAccuracyGlmNormSf.test_batched_norm()") - - np.random.seed(1) - self.noise_model = "norm" - self.simulate() - self._test_batched(sparse=False) - self._test_batched(sparse=True) - - -class TestAccuracyGlmBetaSf( - _TestAccuracyGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for beta distributed data. - Note: size factors are note implemented for beta distribution. 
- """ - - def test_dummy(self): - return True - - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_hessians_glm_all.py b/batchglm/unit_test/test_hessians_glm_all.py deleted file mode 100644 index 1d0dbf36..00000000 --- a/batchglm/unit_test/test_hessians_glm_all.py +++ /dev/null @@ -1,187 +0,0 @@ -import logging -import unittest -import time -import numpy as np -import scipy.sparse - -import batchglm.data as data_utils -import batchglm.pkg_constants as pkg_constants - -from batchglm.models.base_glm import InputDataGLM - - -class Test_Hessians_GLM_ALL(unittest.TestCase): - noise_model: str - - def setUp(self): - pass - - def tearDown(self): - pass - - def simulate(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - num_observations = 500 - sim = Simulator(num_observations=num_observations, num_features=4) - sim.generate_sample_description(num_conditions=2, num_batches=2) - sim.generate() - - self.sim = sim - - def get_hessians( - self, - input_data: InputDataGLM - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Estimator - elif self.noise_model == "norm": - from batchglm.api.models import Estimator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Estimator - else: - raise ValueError("noise_model not recognized") - - provide_optimizers = {"gd": True, "adam": True, "adagrad": True, "rmsprop": True, - "nr": False, "nr_tr": False, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - - estimator = Estimator( - input_data=input_data, - quick_scale=False, - provide_optimizers=provide_optimizers, - provide_fim=False, - provide_hessian=False, - init_a="standard", - init_b="standard" - ) - estimator.initialize() - estimator_store = estimator.finalize() - - return - estimator_store.fisher_inv - - def _test_compute_hessians(self, sparse): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model=="nb": - from batchglm.api.models.tf1.glm_nb import Simulator, InputDataGLM - elif self.noise_model == "norm": - from batchglm.api.models import Simulator, InputDataGLM - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator, InputDataGLM - else: - raise ValueError("noise_model not recognized") - - num_observations = 500 - num_conditions = 2 - - sim = Simulator(num_observations=num_observations, num_features=4) - sim.generate_sample_description(num_conditions=num_conditions, num_batches=2) - sim.generate() - - sample_description = data_utils.sample_description_from_xarray(sim.data, dim="observations") - design_loc = data_utils.design_matrix(sample_description, formula="~ 1 + condition + batch") - design_scale = data_utils.design_matrix(sample_description, formula="~ 1 + condition") - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(sim.X), - design_loc=design_loc, - design_scale=design_scale - ) - else: - input_data = InputDataGLM( - data=sim.X, - design_loc=design_loc, - design_scale=design_scale - ) - - # Compute hessian based on analytic solution. 
- pkg_constants.HESSIAN_MODE = "analytic" - t0_analytic = time.time() - h_analytic = self.get_hessians(input_data) - t1_analytic = time.time() - t_analytic = t1_analytic - t0_analytic - - # Compute hessian based on tensorflow auto-differentiation. - pkg_constants.HESSIAN_MODE = "tf1" - t0_tf = time.time() - h_tf = self.get_hessians(input_data) - t1_tf = time.time() - t_tf = t1_tf - t0_tf - - logging.getLogger("batchglm").info("run time observation batch-wise analytic solution: %f" % t_analytic) - logging.getLogger("batchglm").info("run time tensorflow solution: %f" % t_tf) - logging.getLogger("batchglm").info("MAD: %f" % np.max(np.abs((h_tf - h_analytic)))) - - #i = 1 - #print(h_tf[i, :, :]) - #print(h_analytic[i, :, :]) - #print(h_tf[i, :, :] - h_analytic[i, :, :]) - - # Make sure that hessians are not all zero which might make evaluation of equality difficult. - assert np.sum(np.abs(h_analytic)) > 1e-10, \ - "hessians too small to perform test: %f" % np.sum(np.abs(h_analytic)) - mad = np.max(np.abs(h_tf - h_analytic)) - assert mad < 1e-15, mad - return True - - -class Test_Hessians_GLM_NB(Test_Hessians_GLM_ALL, unittest.TestCase): - - def test_compute_hessians_nb(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("batchglm").error("Test_Hessians_GLM_NB.test_compute_hessians_nb()") - - self.noise_model = "nb" - self._test_compute_hessians(sparse=False) - #self._test_compute_hessians(sparse=False) # TODO tf1>=1.13 waiting for tf1.sparse.expand_dims to work - - return True - - -class Test_Hessians_GLM_NORM(Test_Hessians_GLM_ALL, unittest.TestCase): - - def test_compute_hessians_norm(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("batchglm").error("Test_Hessians_GLM_NORM.test_compute_hessians_norm()") - - self.noise_model = "norm" - self._test_compute_hessians(sparse=False) - #self._test_compute_hessians(sparse=False) # TODO tf1>=1.13 waiting for tf1.sparse.expand_dims to work - - return True - - -class Test_Hessians_GLM_BETA(Test_Hessians_GLM_ALL, unittest.TestCase): - - def test_compute_hessians_beta(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("batchglm").error("Test_Hessians_GLM_BETA.test_compute_hessians_beta()") - - self.noise_model = "beta" - self._test_compute_hessians(sparse=False) - #self._test_compute_hessians(sparse=False) # TODO tf1>=1.13 waiting for tf1.sparse.expand_dims to work - - return True - - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_jacobians_glm_all.py b/batchglm/unit_test/test_jacobians_glm_all.py deleted file mode 100644 index 1605b19a..00000000 --- a/batchglm/unit_test/test_jacobians_glm_all.py +++ /dev/null @@ -1,192 +0,0 @@ -import logging -import unittest -import time -import numpy as np -import scipy.sparse - -import batchglm.data as data_utils -import batchglm.pkg_constants as pkg_constants - -from batchglm.models.base_glm import InputDataGLM - - -class Test_Jacobians_GLM_ALL(unittest.TestCase): - noise_model: str - - def setUp(self): - pass - - def tearDown(self): - pass - - def simulate(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models import Simulator - 
elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - num_observations = 500 - sim = Simulator(num_observations=num_observations, num_features=4) - sim.generate_sample_description(num_conditions=2, num_batches=2) - sim.generate() - - self.sim = sim - - def get_jacs( - self, - input_data: InputDataGLM - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Estimator - elif self.noise_model == "norm": - from batchglm.api.models import Estimator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Estimator - else: - raise ValueError("noise_model not recognized") - - provide_optimizers = {"gd": True, "adam": True, "adagrad": True, "rmsprop": True, - "nr": False, "nr_tr": False, - "irls": False, "irls_gd": False, "irls_tr": False, "irls_gd_tr": False} - - estimator = Estimator( - input_data=input_data, - quick_scale=False, - provide_optimizers=provide_optimizers, - provide_fim=False, - provide_hessian=False, - init_a="standard", - init_b="standard" - ) - estimator.initialize() - # Do not train, evaluate at initialization! - estimator.train_sequence(training_strategy=[ - { - "convergence_criteria": "step", - "stopping_criteria": 0, - "use_batching": False, - "optim_algo": "gd", - "train_mu": False, - "train_r": False - }, - ]) - estimator_store = estimator.finalize() - return estimator_store.gradients.values - - def compare_jacs( - self, - design, - sparse - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model=="nb": - from batchglm.api.models.tf1.glm_nb import InputDataGLM - elif self.noise_model == "norm": - from batchglm.api.models import InputDataGLM - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import InputDataGLM - else: - raise ValueError("noise_model not recognized") - - sample_description = data_utils.sample_description_from_xarray(self.sim.data, dim="observations") - design_loc = data_utils.design_matrix(sample_description, formula=design) - design_scale = data_utils.design_matrix(sample_description, formula=design) - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(self.sim.X), - design_loc=design_loc, - design_scale=design_scale - ) - else: - input_data = InputDataGLM( - data=self.sim.X, - design_loc=design_loc, - design_scale=design_scale - ) - - logging.getLogger("batchglm").debug("** Running analytic Jacobian test") - pkg_constants.JACOBIAN_MODE = "analytic" - t0_analytic = time.time() - J_analytic = self.get_jacs(input_data) - t1_analytic = time.time() - t_analytic = t1_analytic - t0_analytic - - logging.getLogger("batchglm").debug("** Running tensorflow Jacobian test") - pkg_constants.JACOBIAN_MODE = "tf1" - t0_tf = time.time() - J_tf = self.get_jacs(input_data) - t1_tf = time.time() - t_tf = t1_tf - t0_tf - - # Make sure that jacobians are not all zero which might make evaluation of equality difficult. 
- assert np.sum(np.abs(J_analytic)) > 1e-10, \ - "jacobians too small to perform test: %f" % np.sum(np.abs(J_analytic)) - - logging.getLogger("batchglm").info("run time tensorflow solution: %f" % t_tf) - logging.getLogger("batchglm").info("run time observation batch-wise analytic solution: %f" % t_analytic) - logging.getLogger("batchglm").info("MAD: %f" % np.max(np.abs((J_tf - J_analytic)))) - logging.getLogger("batchglm").info("MRAD: %f" % np.max(np.abs((J_tf - J_analytic) / J_tf))) - - #print(J_tf) - #print(J_analytic) - #print((J_tf - J_analytic) / J_tf) - - mrad = np.max(np.abs((J_tf - J_analytic) / J_tf)) - assert mrad < 1e-12, mrad - return True - - def _test_compute_jacobians(self, sparse): - self.simulate() - self.compare_jacs(design="~ 1 + condition + batch", sparse=sparse) - - -class Test_Jacobians_GLM_NB(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_NB.test_compute_jacobians_nb()") - - self.noise_model = "nb" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seems to work here yet. - - -class Test_Jacobians_GLM_NORM(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_NORM.test_compute_jacobians_norm()") - - self.noise_model = "norm" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seem to work here yet. - -class Test_Jacobians_GLM_BETA(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_BETA.test_compute_jacobians_beta()") - - self.noise_model = "beta" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seem to work here yet. 
- - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_jacobians_glm_all_tf2.py b/batchglm/unit_test/test_jacobians_glm_all_tf2.py deleted file mode 100644 index 5bb329dd..00000000 --- a/batchglm/unit_test/test_jacobians_glm_all_tf2.py +++ /dev/null @@ -1,186 +0,0 @@ -import logging -import unittest -import time -import numpy as np -import scipy.sparse - -import batchglm.api as glm -import batchglm.data as data_utils -import batchglm.pkg_constants as pkg_constants - -from batchglm.models.base_glm import InputDataGLM - - -class Test_Jacobians_GLM_ALL(unittest.TestCase): - noise_model: str - - def setUp(self): - pass - - def tearDown(self): - pass - - def simulate(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models.glm_norm import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - num_observations = 500 - sim = Simulator(num_observations=num_observations, num_features=4) - sim.generate_sample_description(num_conditions=2, num_batches=2) - sim.generate() - - self.sim = sim - - def get_jacs( - self, - input_data: InputDataGLM - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.glm_nb import Estimator - elif self.noise_model == "norm": - from batchglm.api.models.glm_norm import Estimator - elif self.noise_model == "beta": - from batchglm.api.models.glm_beta import Estimator - else: - raise ValueError("noise_model not recognized") - - estimator = Estimator( - input_data=input_data, - init_a=self.sim.a_var, - init_b=self.sim.b_var - ) - estimator.initialize() - # Do not train, evaluate at initialization! 
- estimator.train_sequence(training_strategy=[ - { - "convergence_criteria": "step", - "stopping_criteria": 1, - "use_batching": False, - "optim_algo": "gd", - "train_mu": True, - "train_r": True, - "autograd": pkg_constants.JACOBIAN_MODE == "tf" - }, - ]) - estimator.finalize() - return estimator.jacobian - - def compare_jacs( - self, - design, - sparse - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model=="nb": - from batchglm.api.models.glm_nb import InputDataGLM - elif self.noise_model == "norm": - from batchglm.api.models.glm_norm import InputDataGLM - elif self.noise_model == "beta": - from batchglm.api.models.glm_beta import InputDataGLM - else: - raise ValueError("noise_model not recognized") - - sample_description = self.sim.sample_description - design_loc = data_utils.design_matrix(sample_description, formula=design) - design_scale = data_utils.design_matrix(sample_description, formula=design) - - if sparse: - input_data = InputDataGLM( - data=scipy.sparse.csr_matrix(self.sim.x), - design_loc=design_loc, - design_scale=design_scale - ) - else: - input_data = InputDataGLM( - data=self.sim.x, - design_loc=design_loc, - design_scale=design_scale - ) - - logging.getLogger("batchglm").debug("** Running analytic Jacobian test") - pkg_constants.JACOBIAN_MODE = "analytic" - t0_analytic = time.time() - J_analytic = self.get_jacs(input_data) - t1_analytic = time.time() - t_analytic = t1_analytic - t0_analytic - - logging.getLogger("batchglm").debug("** Running tensorflow Jacobian test") - pkg_constants.JACOBIAN_MODE = "tf" - t0_tf = time.time() - J_tf = self.get_jacs(input_data) - t1_tf = time.time() - t_tf = t1_tf - t0_tf - - # Make sure that jacobians are not all zero which might make evaluation of equality difficult. - assert np.sum(np.abs(J_analytic)) > 1e-10, \ - "jacobians too small to perform test: %f" % np.sum(np.abs(J_analytic)) - - logging.getLogger("batchglm").info("run time tensorflow solution: %f" % t_tf) - logging.getLogger("batchglm").info("run time observation batch-wise analytic solution: %f" % t_analytic) - logging.getLogger("batchglm").info("MAD: %f" % np.max(np.abs((J_tf - J_analytic)))) - logging.getLogger("batchglm").info("MRAD: %f" % np.max(np.abs((J_tf - J_analytic) / J_tf))) - - #print(J_tf) - #print(J_analytic) - #print((J_tf - J_analytic) / J_tf) - - mrad = np.max(np.abs((J_tf - J_analytic) / J_tf)) - assert mrad < 1e-10, mrad # changed 1e-12 to 1e-10 - return True - - def _test_compute_jacobians(self, sparse): - self.simulate() - self.compare_jacs(design="~ 1 + condition + batch", sparse=sparse) - - -class Test_Jacobians_GLM_NB(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_nb(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_NB.test_compute_jacobians_nb()") - - self.noise_model = "nb" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seems to work here yet. 
- -""" -class Test_Jacobians_GLM_NORM(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_norm(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_NORM.test_compute_jacobians_norm()") - - self.noise_model = "norm" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seem to work here yet. - -class Test_Jacobians_GLM_BETA(Test_Jacobians_GLM_ALL, unittest.TestCase): - - def test_compute_jacobians_beta(self): - logging.getLogger("tensorflow").setLevel(logging.INFO) - logging.getLogger("batchglm").setLevel(logging.INFO) - logging.getLogger("batchglm").error("Test_Jacobians_GLM_BETA.test_compute_jacobians_beta()") - - self.noise_model = "beta" - self._test_compute_jacobians(sparse=False) - #self._test_compute_jacobians(sparse=True) #TODO automatic differentiation does not seem to work here yet. -""" - -if __name__ == '__main__': - unittest.main() diff --git a/batchglm/unit_test/test_simulators_glm_all.py b/batchglm/unit_test/test_simulators_glm_all.py deleted file mode 100644 index 306fbcd1..00000000 --- a/batchglm/unit_test/test_simulators_glm_all.py +++ /dev/null @@ -1,128 +0,0 @@ -import logging -import unittest -import numpy as np - -import batchglm.api as glm -from batchglm.models.base_glm import _SimulatorGLM, InputDataGLM - -glm.setup_logging(verbosity="WARNING", stream="STDOUT") -logger = logging.getLogger(__name__) - - -class TestSimulationGlmAll: - - sim: _SimulatorGLM - input_data: InputDataGLM - noise_model: str - - def eval_simulation_mean( - self - ): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - threshold_dev = 1e-2 - threshold_std = 1e-1 - elif self.noise_model == "norm": - threshold_dev = 1e-2 - threshold_std = 1e-1 - elif self.noise_model == "beta": - threshold_dev = 1e-2 - threshold_std = 1e-1 - else: - raise ValueError("noise_model not recognized") - - means_sim = self.sim.a_var[0, :] - means_obs = self.sim.link_loc(np.mean(self.sim.input_data.x, axis=0)) - mean_dev = np.mean(means_sim - means_obs) - std_dev = np.std(means_sim - means_obs) - - logging.getLogger("batchglm").info("mean_dev_a %f" % mean_dev) - logging.getLogger("batchglm").info("std_dev_a %f" % std_dev) - - if np.abs(mean_dev) < threshold_dev and \ - std_dev < threshold_std: - return True - else: - return False - - def _test_all_moments(self): - if self.noise_model is None: - raise ValueError("noise_model is None") - else: - if self.noise_model == "nb": - from batchglm.api.models.tf1.glm_nb import Simulator - elif self.noise_model == "norm": - from batchglm.api.models import Simulator - elif self.noise_model == "beta": - from batchglm.api.models.tf1.glm_beta import Simulator - else: - raise ValueError("noise_model not recognized") - - self.sim = Simulator( - num_observations=100000, - num_features=10 - ) - self.sim.generate_sample_description(num_batches=1, num_conditions=1) - self.sim.generate_params() - self.sim.generate_data() - - success = self.eval_simulation_mean() - assert success, "mean of simulation was inaccurate" - return True - - -class TestSimulationGlmNb( - TestSimulationGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for negative binomial data. 
- """ - - def test(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestSimulationGlmNb.test()") - - self.noise_model = "nb" - self._test_all_moments() - - -class TestSimulationGlmNorm( - TestSimulationGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for normally distributed data. - """ - - def test(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestSimulationGlmNorm.test()") - - self.noise_model = "norm" - self._test_all_moments() - - -class TestSimulationGlmBeta( - TestSimulationGlmAll, - unittest.TestCase -): - """ - Test whether optimizers yield exact results for beta distributed data. - """ - - def test(self): - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.INFO) - logger.error("TestSimulationGlmBeta.test()") - - self.noise_model = "beta" - self._test_all_moments() - - -if __name__ == '__main__': - unittest.main() From 8296133e2e4e98d66e4c2b6177c0f353544a4761 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 27 Jan 2022 18:37:16 +0100 Subject: [PATCH 2/4] Remove TF mentions --- README.md | 6 +++--- batchglm/pkg_constants.py | 1 - setup.py | 4 ---- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 34e95b65..95d70bdc 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fast and scalable fitting of over-determined generalized-linear models (GLMs) -batchglm was developed in the context of [diffxpy](https://github.com/theislab/diffxpy) to allow fast model fitting for differential expression analysis for single-cell RNA-seq data. However, one can use batchglm or its concepts in other scenarios where over-determined GLMs are encountered. batchglm is based on TensorFlow - +batchglm was developed in the context of [diffxpy](https://github.com/theislab/diffxpy) to allow fast model fitting for differential expression analysis for single-cell RNA-seq data. However, one can use batchglm or its concepts in other scenarios where over-determined GLMs are encountered. 
+ diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 9afb32bf..ed9749c8 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -28,7 +28,6 @@ GTOL_BY_FEATURE_SCALE = 1e-8 try: - import tensorflow as tf TF_NUM_THREADS = int(os.environ.get('TF_NUM_THREADS', 0)) TF_LOOP_PARALLEL_ITERATIONS = int(os.environ.get('TF_LOOP_PARALLEL_ITERATIONS', 10)) diff --git a/setup.py b/setup.py index 1397bde7..ef2dd5ab 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,6 @@ 'dask' ], extras_require={ - 'optional': [ - 'tensorflow>=1.14.0', - 'tensorflow-gpu>=1.14.0' - ], 'plotting_deps': [ "matplotlib", "seaborn" From 70a08d92bca3aa4082d0b5a9e04fe8da39e75378 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 27 Jan 2022 18:38:42 +0100 Subject: [PATCH 3/4] Remove TF2 --- batchglm/api/models/__init__.py | 4 ---- batchglm/api/models/tf2/__init__.py | 3 --- batchglm/api/models/tf2/glm_beta.py | 2 -- batchglm/api/models/tf2/glm_nb.py | 2 -- batchglm/api/models/tf2/glm_norm.py | 2 -- 5 files changed, 13 deletions(-) delete mode 100644 batchglm/api/models/tf2/__init__.py delete mode 100644 batchglm/api/models/tf2/glm_beta.py delete mode 100644 batchglm/api/models/tf2/glm_nb.py delete mode 100644 batchglm/api/models/tf2/glm_norm.py diff --git a/batchglm/api/models/__init__.py b/batchglm/api/models/__init__.py index eff3c3f2..ca70d778 100644 --- a/batchglm/api/models/__init__.py +++ b/batchglm/api/models/__init__.py @@ -1,5 +1 @@ from . import numpy -try: - from . import tf2 -except ImportError: - tf2 = None diff --git a/batchglm/api/models/tf2/__init__.py b/batchglm/api/models/tf2/__init__.py deleted file mode 100644 index 8fbdb228..00000000 --- a/batchglm/api/models/tf2/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import glm_beta -from . import glm_nb -from . import glm_norm diff --git a/batchglm/api/models/tf2/glm_beta.py b/batchglm/api/models/tf2/glm_beta.py deleted file mode 100644 index 8b5f563e..00000000 --- a/batchglm/api/models/tf2/glm_beta.py +++ /dev/null @@ -1,2 +0,0 @@ -#from batchglm.models.glm_beta import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_beta import Estimator diff --git a/batchglm/api/models/tf2/glm_nb.py b/batchglm/api/models/tf2/glm_nb.py deleted file mode 100644 index 8e2ba7a9..00000000 --- a/batchglm/api/models/tf2/glm_nb.py +++ /dev/null @@ -1,2 +0,0 @@ -from batchglm.models.glm_nb import InputDataGLM, Model, Simulator -from batchglm.train.tf2.glm_nb import Estimator diff --git a/batchglm/api/models/tf2/glm_norm.py b/batchglm/api/models/tf2/glm_norm.py deleted file mode 100644 index 45fc0453..00000000 --- a/batchglm/api/models/tf2/glm_norm.py +++ /dev/null @@ -1,2 +0,0 @@ -#from batchglm.models.glm_norm import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_norm import Estimator From 12e75cd84ab5063ddffa866c46b5078e11d7b7a4 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 27 Jan 2022 18:56:32 +0100 Subject: [PATCH 4/4] Remove constants. 
--- batchglm/pkg_constants.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index ed9749c8..eefd624c 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -25,24 +25,4 @@ XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 -GTOL_BY_FEATURE_SCALE = 1e-8 - -try: - - TF_NUM_THREADS = int(os.environ.get('TF_NUM_THREADS', 0)) - TF_LOOP_PARALLEL_ITERATIONS = int(os.environ.get('TF_LOOP_PARALLEL_ITERATIONS', 10)) - - TF_CONFIG_PROTO = tf.compat.v1.ConfigProto() - TF_CONFIG_PROTO.allow_soft_placement = True - TF_CONFIG_PROTO.log_device_placement = False - TF_CONFIG_PROTO.gpu_options.allow_growth = True - TF_CONFIG_PROTO.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 - - TF_CONFIG_PROTO.inter_op_parallelism_threads = TF_NUM_THREADS - TF_CONFIG_PROTO.intra_op_parallelism_threads = TF_NUM_THREADS - - if TF_NUM_THREADS == 0: - TF_NUM_THREADS = multiprocessing.cpu_count() - -except ImportError: - tf = None +GTOL_BY_FEATURE_SCALE = 1e-8 \ No newline at end of file