Release GSEA 0.14.0-rc

KwatMDPhD · KwatMDPhD · commit 8008cfdd5c76 · 2023-02-03T09:07:11.000-07:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "GSEA"
 uuid = "f74a9ba8-6f1f-48f2-98b6-a79c46cc06f7"
 authors = ["KwatMDPhD <kwat.me@icloud.com>"]
-version = "0.12.0"
+version = "0.14.0-rc"
 
 [deps]
 BioLab = "1fe83854-81c0-42f7-afc9-71ba9af673ca"
diff --git a/README.md b/README.md
@@ -52,22 +52,6 @@ head -2 output/*.tsv
 open output/plot/*.html
 ```
 
-#### Alternatively, (instead of in command line) run this example in `julia`
-
-```jl
-using GSEA
-
-cd("example.sarcopenia")
-
-GSEA.metric_rank(
-    "metric_rank.json",
-    "target_x_sample_x_number.tsv",
-    "gene_x_sample_x_score.tsv",
-    "set_genes.json",
-    "output",
-)
-```
-
 ## Settings are just a [`.json` file](setting)
 
 - `metric` for ranking genes (for `metric-rank`)
@@ -100,7 +84,7 @@ GSEA.metric_rank(
 
 - `algorithm` for computing enrichment
 
-  `cidac` (_cumulative information divergence with antisymmetricity and complementation_) | `ks` (_Kolmogorov Smirnov_) | `ksa` (`ks` area)
+  `KS` (_Kolmogorov-Smirnov_) | `KSA` (`KS` area) | `KL` (cumulative information divergence) | `SKL` (_symmetric_ cumulative information divergence) | `AKL` (_antisymmetric_ cumulative information divergence)
 
 - `number_of_jobs`
 
diff --git a/example.sarcopenia/metric_rank.json b/example.sarcopenia/metric_rank.json
@@ -6,7 +6,7 @@
   "minimum_gene_set_size": 15,
   "maximum_gene_set_size": 500,
   "exponent": 1.0,
-  "algorithm": "ks",
+  "algorithm": "KS",
   "number_of_jobs": 1,
   "permutation": "sample",
   "random_seed": 20150603,
diff --git a/setting/data_rank.json b/setting/data_rank.json
@@ -3,6 +3,6 @@
   "minimum_gene_set_size": 15,
   "maximum_gene_set_size": 500,
   "exponent": 1.0,
-  "algorithm": "cidac",
+  "algorithm": "AKL",
   "number_of_jobs": 1
 }
diff --git a/setting/metric_rank.json b/setting/metric_rank.json
@@ -6,7 +6,7 @@
   "minimum_gene_set_size": 15,
   "maximum_gene_set_size": 500,
   "exponent": 1.0,
-  "algorithm": "cidac",
+  "algorithm": "AKL",
   "number_of_jobs": 1,
   "permutation": "sample",
   "random_seed": 20150603,
diff --git a/setting/user_rank.json b/setting/user_rank.json
@@ -5,7 +5,7 @@
   "minimum_gene_set_size": 15,
   "maximum_gene_set_size": 500,
   "exponent": 1.0,
-  "algorithm": "cidac",
+  "algorithm": "AKL",
   "number_of_jobs": 1,
   "random_seed": 20150603,
   "number_of_permutations": 100,
diff --git a/src/_filter_set!.jl b/src/_filter_set!.jl
@@ -1,11 +1,9 @@
 function _filter_set!(se_fe_, re, in_, mi, ma)
 
-    #
     println("Before filtering sets")
 
     BioLab.Dict.print(se_fe_, 0)
 
-    #
     if re
 
         println("Removing set genes not found in gene-x-sample genes")
@@ -18,7 +16,6 @@ function _filter_set!(se_fe_, re, in_, mi, ma)
 
     end
 
-    #
     println("Keeping sets: $mi <= size <= $ma")
 
     for (se, fe_) in se_fe_
@@ -31,7 +28,6 @@ function _filter_set!(se_fe_, re, in_, mi, ma)
 
     end
 
-    #
     println("After")
 
     BioLab.Dict.print(se_fe_, 0)
diff --git a/src/_plot_mountain.jl b/src/_plot_mountain.jl
@@ -1,19 +1,15 @@
 function _plot_mountain(se_x_st_x_nu, fe, sc, n_ex, pl_, al, fe_, sc_, se_fe_, sy_ar, di)
 
-    #
     n_se = size(se_x_st_x_nu, 1)
 
-    #
     n_ex = min(n_ex, n_se)
 
     co_ = [1, 2]
 
-    #
     for ro in 1:n_ex
 
         se, en = se_x_st_x_nu[ro, co_]
 
-        #
         if en <= 0 && !(se in pl_)
 
             push!(pl_, se)
@@ -22,7 +18,6 @@ function _plot_mountain(se_x_st_x_nu, fe, sc, n_ex, pl_, al, fe_, sc_, se_fe_, s
 
     end
 
-    #
     for ro in n_se:-1:(n_se - n_ex + 1)
 
         se, en = se_x_st_x_nu[ro, co_]
@@ -35,26 +30,14 @@ function _plot_mountain(se_x_st_x_nu, fe, sc, n_ex, pl_, al, fe_, sc_, se_fe_, s
 
     end
 
-    #
     pl = mkpath(joinpath(di, "plot"))
 
-    #
     pop!(sy_ar, :n_jo)
 
-    if al == "cidac"
-
-        fu = BioLab.FeatureSetEnrichment.score_set_new
-
-    elseif al == "ks"
-
-        fu = BioLab.FeatureSetEnrichment.score_set
-
-    end
-
-    #
     for se in pl_
 
-        fu(
+        BioLab.FeatureSetEnrichment.score_set(
+            al,
             fe_,
             sc_,
             se_fe_[se];
diff --git a/src/_tabulate_statistic.jl b/src/_tabulate_statistic.jl
@@ -1,28 +1,21 @@
 function _tabulate_statistic(se_en, se_ra_, ou)
 
-    #
     se_ = collect(keys(se_en))
 
     en_ = collect(values(se_en))
 
-    #
     mkpath(ou)
 
-    #
     if isempty(se_ra_)
 
-        #
         gl_ = gla_ = fill(NaN, length(se_))
 
     else
 
-        #
         ra__ = [collect(values(se_ra)) for se_ra in se_ra_]
 
-        #
         gl_, gla_ = BioLab.Significance.get_p_value_and_adjust(en_, vcat(ra__...))
 
-        #
         se_x_ra_x_en = DataFrame("Set" => se_)
 
         insertcols!(se_x_ra_x_en, (string(id) => ra_ for (id, ra_) in enumerate(ra__))...)
@@ -31,7 +24,6 @@ function _tabulate_statistic(se_en, se_ra_, ou)
 
     end
 
-    #
     se_x_st_x_nu = sort(
         DataFrame(
             "Set" => se_,
diff --git a/src/data_rank.jl b/src/data_rank.jl
@@ -10,13 +10,10 @@ Run data-rank (single-sample) GSEA.
 """
 @cast function data_rank(setting_json, gene_x_sample_x_score_tsv, set_genes_json, output_directory)
 
-    #
     ke_ar = BioLab.Dict.read(setting_json)
 
-    #
     fe_x_sa_x_sc = BioLab.Table.read(gene_x_sample_x_score_tsv)
 
-    #
     se_fe_ = BioLab.Dict.read(set_genes_json)
 
     _filter_set!(
@@ -27,11 +24,10 @@ Run data-rank (single-sample) GSEA.
         ke_ar["maximum_gene_set_size"],
     )
 
-    #
     se_x_sa_x_en = BioLab.FeatureSetEnrichment.score_set(
+        ke_ar["algorithm"],
         fe_x_sa_x_sc,
         se_fe_;
-        al = ke_ar["algorithm"],
         _make_keyword_argument(ke_ar)...,
     )
 
diff --git a/src/metric_rank.jl b/src/metric_rank.jl
@@ -25,18 +25,15 @@ Run metric-rank (standard) GSEA.
     output_directory,
 )
 
-    #
     ke_ar = BioLab.Dict.read(setting_json)
 
-    #
     ta_, sat_, ta_x_sa_x_nu =
         BioLab.DataFrame.separate(BioLab.Table.read(target_x_sample_x_number_tsv))[[2, 3, 4]]
 
     BioLab.Array.error_duplicate(ta_)
 
     BioLab.Matrix.error_bad(ta_x_sa_x_nu, Real)
 
-    #
     fe_, saf_, fe_x_sa_x_sc =
         BioLab.DataFrame.separate(BioLab.Table.read(gene_x_sample_x_score_tsv))[[2, 3, 4]]
 
@@ -46,10 +43,8 @@ Run metric-rank (standard) GSEA.
 
     fe_x_sa_x_sc = fe_x_sa_x_sc[:, indexin(sat_, saf_)]
 
-    #
     mkpath(output_directory)
 
-    #
     bi_ = BitVector(ta_x_sa_x_nu[1, :])
 
     me = ke_ar["metric"]
@@ -61,7 +56,6 @@ Run metric-rank (standard) GSEA.
         DataFrame("Gene" => fe_, me => sc_),
     )
 
-    #
     se_fe_ = BioLab.Dict.read(set_genes_json)
 
     _filter_set!(
@@ -72,13 +66,12 @@ Run metric-rank (standard) GSEA.
         ke_ar["maximum_gene_set_size"],
     )
 
-    #
+    al = ke_ar["algorithm"]
+
     fe = ke_ar["feature_name"]
 
     sc = ke_ar["score_name"]
 
-    al = ke_ar["algorithm"]
-
     sy_ar = _make_keyword_argument(ke_ar)
 
     pe = ke_ar["permutation"]
@@ -91,31 +84,23 @@ Run metric-rank (standard) GSEA.
 
     pl_ = ke_ar["gene_sets_to_plot"]
 
-    #
     if pe == "sample"
 
-        #
-        fu, id = BioLab.FeatureSetEnrichment._match_algorithm(al)
-
-        se_en = Dict(se => en[id] for (se, en) in fu(fe_, sc_, se_fe_; sy_ar...))
+        se_en = BioLab.FeatureSetEnrichment.score_set(al, fe_, sc_, se_fe_; sy_ar...)
 
-        #
         if 0 < n_pe
 
             println("Permuting $(pe)s to compute significance")
 
-            #
             seed!(ra)
 
-            #
             se_ra_ = [
-                Dict(se => en[id] for (se, en) in se_en) for se_en in (
-                    fu(
-                        _compare_and_sort(shuffle!(bi_), fe_x_sa_x_sc, me, fe_)...,
-                        se_fe_;
-                        sy_ar...,
-                    ) for _ in ProgressBar(1:n_pe)
-                )
+                BioLab.FeatureSetEnrichment.score_set(
+                    al,
+                    _compare_and_sort(shuffle!(bi_), fe_x_sa_x_sc, me, fe_)...,
+                    se_fe_;
+                    sy_ar...,
+                ) for _ in ProgressBar(1:n_pe)
             ]
 
         else
@@ -124,7 +109,6 @@ Run metric-rank (standard) GSEA.
 
         end
 
-        #
         se_x_st_x_nu = _tabulate_statistic(se_en, se_ra_, output_directory)
 
         _plot_mountain(
@@ -143,12 +127,10 @@ Run metric-rank (standard) GSEA.
 
         se_x_st_x_nu
 
-        #
     elseif pe == "set"
 
-        user_rank(fe_, sc_, se_fe_, fe, sc, al, sy_ar, ra, n_pe, n_ex, pl_, output_directory)
+        user_rank(al, fe_, sc_, se_fe_, fe, sc, sy_ar, ra, n_pe, n_ex, pl_, output_directory)
 
-        #
     else
 
         error("`permutation` is not `sample` or `set`.")
diff --git a/src/user_rank.jl b/src/user_rank.jl
diff --git a/test/data/small/metric_rank.json b/test/data/small/metric_rank.json
diff --git a/test/runtests.ipynb b/test/runtests.ipynb
diff --git a/test/runtests.jl b/test/runtests.jl

Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,6 @@`
`3`	`3`	`"minimum_gene_set_size": 15,`
`4`	`4`	`"maximum_gene_set_size": 500,`
`5`	`5`	`"exponent": 1.0,`
`6`		`- "algorithm": "cidac",`
	`6`	`+ "algorithm": "AKL",`
`7`	`7`	`"number_of_jobs": 1`
`8`	`8`	`}`