got coefficient comparison to OLS model to work

davidsebfischer · davidsebfischer · commit 8f1a0a0518f2 · 2019-04-03T17:09:16.000+02:00
diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py
@@ -292,7 +292,8 @@ def plot_volcano(
             highlight_col: str = "red",
             show: bool = True,
             save: Union[str, None] = None,
-            suffix: str = "_volcano.png"
+            suffix: str = "_volcano.png",
+            return_axs: bool = False
     ):
         """
         Returns a volcano plot of p-value vs. log fold change
@@ -314,6 +315,7 @@ def plot_volcano(
         :param save: Path+file name stem to save plots to.
             File will be save+suffix. Does not save if save is None.
         :param suffix: Suffix for file name to save plot to. Also use this to set the file type.
+        :param return_axs: Whether to return axis objects.
 
         :return: Tuple of matplotlib (figure, axis)
         """
@@ -379,7 +381,10 @@ def plot_volcano(
 
         plt.close(fig)
 
-        return ax
+        if return_axs:
+            return ax
+        else:
+            return
 
     def plot_ma(
             self,
@@ -393,7 +398,8 @@ def plot_ma(
             highlight_col: str = "red",
             show: bool = True,
             save: Union[str, None] = None,
-            suffix: str = "_my_plot.png"
+            suffix: str = "_ma_plot.png",
+            return_axs: bool = False
     ):
         """
         Returns an MA plot of mean expression vs. log fold change with significance
@@ -416,7 +422,7 @@ def plot_ma(
         :param save: Path+file name stem to save plots to.
             File will be save+suffix. Does not save if save is None.
         :param suffix: Suffix for file name to save plot to. Also use this to set the file type.
-
+        :param return_axs: Whether to return axis objects.
 
         :return: Tuple of matplotlib (figure, axis)
         """
@@ -487,7 +493,10 @@ def plot_ma(
         plt.close(fig)
         plt.ion()
 
-        return ax
+        if return_axs:
+            return ax
+        else:
+            return
 
 
 class _DifferentialExpressionTestSingle(_DifferentialExpressionTest, metaclass=abc.ABCMeta):
@@ -756,7 +765,8 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle):
     def __init__(
             self,
             model_estim: _Estimation,
-            col_indices: np.ndarray
+            col_indices: np.ndarray,
+            noise_model: str,
     ):
         """
         :param model_estim:
@@ -766,6 +776,7 @@ def __init__(
 
         self.model_estim = model_estim
         self.coef_loc_totest = col_indices
+        self.noise_model = noise_model
 
         try:
             if model_estim._error_codes is not None:
@@ -889,7 +900,18 @@ def summary(self, qval_thres=None, fc_upper_thres=None,
 
         return res
 
-    def plot_vs_ttest(self, log10=False):
+    def plot_vs_ttest(
+            self,
+            log10=False,
+            return_axs: bool = False
+    ):
+        """
+
+        :param log10:
+        :param return_axs: Whether to return axis objects.
+
+        :return:
+        """
         import matplotlib.pyplot as plt
         import seaborn as sns
         from .tests import t_test
@@ -913,7 +935,139 @@ def plot_vs_ttest(self, log10=False):
 
         ax.set(xlabel="t-test", ylabel='wald test')
 
-        return fig, ax
+        if return_axs:
+            return ax
+        else:
+            return
+
+    def plot_comparison_ols(
+            self,
+            size=20,
+            show: bool = True,
+            save: Union[str, None] = None,
+            suffix: str = "_ols_comparison.png",
+            ncols=5,
+            row_gap=0.3,
+            col_gap=0.25,
+            return_axs: bool = False
+    ):
+        """
+        Plot location model coefficients of inferred model against those obtained from an OLS model.
+
+        One panel is drawn per location model coefficient. The red line shown is the identity line.
+
+        :param size: Size of points.
+        :param show: Whether to display the plot.
+        :param save: Path+file name stem to save plots to.
+            File will be save+suffix. Does not save if save is None.
+        :param suffix: Suffix for file name to save plot to. Also use this to set the file type.
+        :param ncols: Number of columns in the panel grid (one panel per coefficient).
+        :param row_gap: Vertical gap between panel rows relative to panel height.
+        :param col_gap: Horizontal gap between panel columns relative to panel width.
+        :param return_axs: Whether to return axis objects.
+
+        :return: List of matplotlib axis objects (one per coefficient) if return_axs, otherwise None.
+        :raises ValueError: If the noise model is none of "nb", "norm" or "normal" (case-insensitive).
+        """
+        import seaborn as sns
+        import matplotlib.pyplot as plt
+        from matplotlib import gridspec
+        from matplotlib import rcParams
+        from batchglm.api.models.glm_norm import Estimator, InputData
+
+        # wald() matches noise_model.lower() against "norm"/"normal", so accept the same spellings here.
+        noise_model = str(self.noise_model).lower()
+        if noise_model not in ["nb", "norm", "normal"]:
+            raise ValueError("noise model %s not yet supported for plot_comparison_ols" % self.noise_model)
+
+        # Run OLS model fit to have comparison coefficients.
+        input_data_ols = InputData.new(
+            data=self.model_estim.input_data.data,
+            design_loc=self.model_estim.input_data.design_loc,
+            design_scale=self.model_estim.input_data.design_scale[:, [0]],
+            constraints_loc=self.model_estim.input_data.constraints_loc,
+            constraints_scale=self.model_estim.input_data.constraints_scale[[0], [0]],
+            size_factors=self.model_estim.input_data.size_factors,
+            feature_names=self.model_estim.input_data.features,
+        )
+        estim_ols = Estimator(
+            input_data=input_data_ols,
+            init_model=None,
+            init_a="standard",
+            init_b="standard",
+            dtype=self.model_estim.a_var.dtype
+        )
+        estim_ols.initialize()
+        store_ols = estim_ols.finalize()
+
+        # Prepare parameter summary of both model fits.
+        par_loc = input_data_ols.data.coords["design_loc_params"].values
+        # Work on a copy so that the stored OLS fit is not modified in place.
+        a_var_ols = np.array(store_ols.a_var.values)
+        if noise_model == "nb":
+            # Translate coefficients from OLS fit to be multiplicative in identity space.
+            a_var_ols[1:, :] = (a_var_ols[1:, :] + a_var_ols[[0], :]) / a_var_ols[[0], :]
+
+        summaries_fits = [
+            pd.DataFrame({
+                "user": self.model_estim.inverse_link_loc(self.model_estim.a_var[i, :]),
+                "ols": a_var_ols[i, :],
+                "coef": par_loc[i]
+            }) for i in range(self.model_estim.a_var.shape[0])
+        ]
+
+        plt.ioff()
+        # Number of grid rows needed to fit one panel per coefficient (ceiling division).
+        nrows = len(par_loc) // ncols + int((len(par_loc) % ncols) > 0)
+
+        gs = gridspec.GridSpec(
+            nrows=nrows,
+            ncols=ncols,
+            hspace=row_gap,
+            wspace=col_gap
+        )
+        fig = plt.figure(
+            figsize=(
+                ncols * rcParams['figure.figsize'][0],  # width in inches
+                nrows * rcParams['figure.figsize'][1] * (1 + row_gap)  # height in inches
+            )
+        )
+
+        axs = []
+        for i, par_i in enumerate(par_loc):
+            ax = plt.subplot(gs[i])
+            axs.append(ax)
+
+            x = summaries_fits[i]["user"].values
+            y = summaries_fits[i]["ols"].values
+            # End points of the identity line, spanning the joint value range of both fits.
+            xy_range = np.array([np.min([np.min(x), np.min(y)]), np.max([np.max(x), np.max(y)])])
+
+            sns.scatterplot(
+                x=x,
+                y=y,
+                ax=ax,
+                s=size
+            )
+            sns.lineplot(x=xy_range, y=xy_range, ax=ax, color="red", legend=False)
+            ax.set(xlabel="user supplied model", ylabel="OLS model")
+            title_i = str(par_i) + " (R=" + str(np.round(np.corrcoef(x, y)[0, 1], 3)) + ")"
+            ax.set_title(title_i)
+
+        # Save, show and return figure.
+        if save is not None:
+            plt.savefig(save + suffix)
+
+        if show:
+            plt.show()
+
+        plt.close(fig)
+        plt.ion()
+
+        if return_axs:
+            return axs
+        else:
+            return
 
 
 class DifferentialExpressionTestTT(_DifferentialExpressionTestSingle):
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
@@ -294,7 +294,7 @@ def lrt(
     """
     # TODO test nestedness
     if len(kwargs) != 0:
-        logger.info("additional kwargs: %s", str(kwargs))
+        logging.getLogger("diffxpy").info("additional kwargs: %s", str(kwargs))
 
     if isinstance(as_numeric, str):
         as_numeric = [as_numeric]
@@ -486,7 +486,7 @@ def wald(
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if len(kwargs) != 0:
-        logger.debug("additional kwargs: %s", str(kwargs))
+        logging.getLogger("diffxpy").debug("additional kwargs: %s", str(kwargs))
 
     if dmat_loc is None and formula_loc is None:
         raise ValueError("Supply either dmat_loc or formula_loc or formula.")
@@ -519,9 +519,11 @@ def wald(
                 if noise_model.lower() not in ["normal", "norm"]:
                     if init_a == "closed_form":
                         init_a = "standard"
-                        logger.warning("Setting init_a to standard as numeric predictors were supplied.")
-                        logger.warning("Closed-form initialisation is not possible" +
-                                       " for noise model %s with numeric predictors." % noise_model)
+                        logging.getLogger("diffxpy").warning(
+                            "Setting init_a to standard as numeric predictors were supplied.")
+                        logging.getLogger("diffxpy").warning(
+                            "Closed-form initialisation is not possible" +
+                            " for noise model %s with numeric predictors." % noise_model)
                     elif init_a == "AUTO":
                         init_a = "standard"
     else:
@@ -538,9 +540,11 @@ def wald(
             if np.any([True if x in as_numeric else False for x in sample_description.columns.values]):
                 if init_b == "closed_form":
                     init_b = "standard"
-                    logger.warning("Setting init_b to standard as numeric predictors were supplied.")
-                    logger.warning("Closed-form initialisation is not possible" +
-                                   " for noise model %s with numeric predictors." % noise_model)
+                    logging.getLogger("diffxpy").warning(
+                        "Setting init_b to standard as numeric predictors were supplied.")
+                    logging.getLogger("diffxpy").warning(
+                        "Closed-form initialisation is not possible" +
+                        " for noise model %s with numeric predictors." % noise_model)
                 elif init_b == "AUTO":
                     init_b = "standard"
     else:
@@ -607,7 +611,8 @@ def wald(
 
     de_test = DifferentialExpressionTestWald(
         model_estim=model,
-        col_indices=col_indices
+        col_indices=col_indices,
+        noise_model=noise_model
     )
 
     return de_test
@@ -968,7 +973,7 @@ def pairwise(
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if len(kwargs) != 0:
-        logger.info("additional kwargs: %s", str(kwargs))
+        logging.getLogger("diffxpy").info("additional kwargs: %s", str(kwargs))
 
     if lazy and not (test.lower() == 'z-test' or test.lower() == 'z_test' or test.lower() == 'ztest'):
         raise ValueError("lazy evaluation of pairwise tests only possible if test is z-test")
@@ -1172,7 +1177,7 @@ def versus_rest(
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if len(kwargs) != 0:
-        logger.info("additional kwargs: %s", str(kwargs))
+        logging.getLogger("diffxpy").info("additional kwargs: %s", str(kwargs))
 
     # Do not store all models but only p-value and q-value matrix:
     # genes x groups
@@ -1811,10 +1816,10 @@ def continuous_1d(
         else:
             factor_loc_totest_new = factor_loc_totest
 
-        logger.debug("model formulas assembled in de.test.continuos():")
-        logger.debug("factor_loc_totest_new: " + ",".join(factor_loc_totest_new))
-        logger.debug("formula_loc_new: " + formula_loc_new)
-        logger.debug("formula_scale_new: " + formula_scale_new)
+        logging.getLogger("diffxpy").debug("model formulas assembled in de.test.continuos():")
+        logging.getLogger("diffxpy").debug("factor_loc_totest_new: " + ",".join(factor_loc_totest_new))
+        logging.getLogger("diffxpy").debug("formula_loc_new: " + formula_loc_new)
+        logging.getLogger("diffxpy").debug("formula_scale_new: " + formula_scale_new)
 
         de_test = wald(
             data=X,
@@ -1837,6 +1842,7 @@ def continuous_1d(
         )
         de_test = DifferentialExpressionTestWaldCont(
             de_test=de_test,
+            noise_model=noise_model,
             size_factors=size_factors,
             continuous_coords=sample_description[continuous].values,
             spline_coefs=new_coefs
@@ -1860,11 +1866,11 @@ def continuous_1d(
         full_formula_scale = formula_scale_new
         reduced_formula_scale = formula_scale_new
 
-        logger.debug("model formulas assembled in de.test.continuous():")
-        logger.debug("full_formula_loc: " + full_formula_loc)
-        logger.debug("reduced_formula_loc: " + reduced_formula_loc)
-        logger.debug("full_formula_scale: " + full_formula_scale)
-        logger.debug("reduced_formula_scale: " + reduced_formula_scale)
+        logging.getLogger("diffxpy").debug("model formulas assembled in de.test.continuous():")
+        logging.getLogger("diffxpy").debug("full_formula_loc: " + full_formula_loc)
+        logging.getLogger("diffxpy").debug("reduced_formula_loc: " + reduced_formula_loc)
+        logging.getLogger("diffxpy").debug("full_formula_scale: " + full_formula_scale)
+        logging.getLogger("diffxpy").debug("reduced_formula_scale: " + reduced_formula_scale)
 
         de_test = lrt(
             data=X,