bug fix

richardkwo · richardkwo · commit a4d55643e054 · 2020-12-03T04:02:15.000-08:00
diff --git a/.travis.yml b/.travis.yml
@@ -1,41 +1,23 @@
 ## Documentation: http://docs.travis-ci.com/user/languages/julia/
 language: julia
-
-fast_finish: true
-
-group: travis_latest
-
 os:
   - linux
   - osx
-
-addons:
-  apt:
-    packages: ['gfortran','liblapack-dev','libopenmpi-dev']
-
 julia:
-  - 0.6
+  - 1.5
+  - 1.1
   - nightly
-
 notifications:
   email: false
-
 git:
-  depth: 99999
+  depth: 99999999
 
-before_install:
-  - if [[ $TRAVIS_OS_NAME == osx ]]; then
-      brew update > /dev/null;
-      brew install gcc || true;
-      brew link --overwrite gcc;
-      brew install lapack open-mpi > /dev/null;
-      export FC=gfortran;
-    fi
-
-# (tests will run but not make your overall status red)
+## allow failures on nightly julia
+## (tests will run but not make your overall status red)
 matrix:
- allow_failures:
- - julia: nightly
+  allow_failures:
+    - julia: nightly
+
 
 after_success:
-- julia -e 'cd(Pkg.dir("InvariantCausal")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
+  - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ version = "1.0.0"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 GLMNet = "8d5ece8b-de18-5317-b113-243142960cc6"
@@ -15,8 +16,8 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
-UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [compat]
 CategoricalArrays = "0.9"
@@ -29,5 +30,3 @@ StatsBase = "0.33"
 StatsModels = "0.6"
 UnicodePlots = "1.3"
 julia = "1"
-
-
diff --git a/README.md b/README.md
@@ -4,13 +4,14 @@
 
 ![college](docs/college.png)
 
-This is a Julia v.0.6 implementation for the Invariant Causal Prediction algorithm of [Peters, Bühlmann and Meinshausen](https://doi.org/10.1111/rssb.12167). The method uncovers direct causes of a target variable from datasets under different environments (e.g., interventions or experimental settings). 
+This is a **Julia 1.x** implementation of the **Invariant Causal Prediction** algorithm of [Peters, Bühlmann and Meinshausen](https://doi.org/10.1111/rssb.12167). The method uncovers direct causes of a target variable from datasets under different environments (e.g., interventions or experimental settings). 
 
 See also this [R package](https://cran.r-project.org/package=InvariantCausalPrediction) and [this report](docs/InvariantCausal.pdf).
 
 #### Changelog
 
-- 2018/06/20: version 0.1.1
+- 2020/12/03: version 1.0.0 (Julia 1.x)
+- 2018/06/20: version 0.1.1 (Julia 0.6)
 
 #### Dependencies
 
@@ -44,122 +45,146 @@ Generate a simple [Gaussian structure equation model](https://en.wikipedia.org/w
 ```julia
 julia> using InvariantCausal
 julia> using Random
-julia> Random.seed!(1926)
+julia> Random.seed!(77)
 julia> sem_obs = random_gaussian_SEM(21, 3)
+
 Gaussian SEM with 21 variables:
 B =
       Sparsity Pattern
       ┌───────────┐
-    1 │⠀⠀⡄⠀⠐⡠⠀⡀⢢⡄⠀│ > 0
-      │⠠⠀⠄⡀⡸⡠⠠⡀⠀⢠⠀│ < 0
-      │⠀⠈⠠⠀⠈⠉⠀⠄⠀⠀⠀│
-      │⠀⢂⠢⠀⢨⢀⠀⡀⢀⠂⡒│
-      │⠀⠀⠀⠀⠠⢲⠀⠄⠀⠀⠐│
-   21 │⠀⠀⠹⠀⠀⠐⠐⠆⠐⠥⠀│
+    1 │⠀⠠⠀⠀⢐⠀⠀⠄⠀⢔⠀│ > 0
+      │⠠⠀⠠⠨⠁⠀⠄⠀⠀⠸⠀│ < 0
+      │⠠⠈⠈⠀⠌⠠⠀⠅⠀⠩⠉│
+      │⠠⣨⠴⠰⠪⠠⠄⠀⠸⠉⣐│
+      │⢀⠲⠈⢠⠠⠀⠀⠂⠀⠲⠁│
+   21 │⠀⠐⠀⠀⠠⠠⠀⠀⠀⠔⠀│
       └───────────┘
       1          21
-        nz = 63σ² = [1.3995969539576336, 1.3797542626927117, 1.8725924411035275, 1.1558670231511754, 0.6313157118985134, 1.3861564933413408, 1.4515091017758692, 1.7392330458711087, 1.55834175481778, 1.1102263218265493, 1.2459898446608833, 0.9582172366364653, 0.8341414371776826, 1.9452530507977812, 1.48880401416046, 1.5359339337413704, 1.691737599591161, 0.6496166911064964, 1.1210005303098285, 1.1459738623697713, 0.6920288559801938]
+        nz = 70σ² = [1.9727697778060356, 1.1224733663047743, 1.1798805640594814, 1.2625825149076064, 0.8503782631176267, 0.5262963446298372, 1.3835334059064883, 1.788996301274282, 1.759286517329432, 0.842571682652995, 1.713382150423666, 1.4524484793202235, 1.9464648511794784, 1.7729995603828317, 0.7110857327642559, 1.6837378902964577, 1.085405687408806, 1.3069888003095986, 1.3933773717634643, 1.0571823834646068, 1.9187793877731028]
 ```
 
-Suppose we want to infer the direct causes for the last variables, which are
+Suppose we want to infer the direct causes of the last variable (variable 21), which are variables 9, 11 and 18.
 
 ```julia
 julia> causes(sem_obs, 21)
-2-element Array{Int64,1}:
- 2
- 5
+3-element Array{Int64,1}:
+  9
+ 11
+ 18
 ```
 
-Firstly, let us generate some observational data and call it environment 1.
+Firstly, let us generate some observational data and call it **environment 1**.
 
 ```julia
 julia> X1 = simulate(sem_obs, 1000)
 ```
 
-Then, we simulate from environment 2 by performing do-intervention on variables 3, 4, 5, 6. Here we set them to fixed random values.
+Then, we simulate from **environment 2** by performing a **do-intervention** on variables 3, 4, 5, 6. Here we set them to fixed random values.
 
 ```julia
 julia> X2 = simulate(sem_obs, [3,4,5,6], randn(4), 1000)
 ```
 
-We run the algorithm on environments 1 and 2.
+We run the algorithm on **environments 1 and 2**.
 
 ```julia
 julia> causalSearch(vcat(X1, X2)[:,1:20], vcat(X1, X2)[:,21], repeat([1,2], inner=1000))
 
-8 variables are screened out from 20 variables with lasso: [2, 5, 6, 8, 13, 15, 16, 20]
-Causal invariance search across 2 environments with at α=0.01 (|S| = 8, method = chow)
-
-S = []                                      : p-value = 0.0000 [ ] ⋂ = [2, 5, 6, 8, 13, 15, 16, 20]
-S = [2]                                     : p-value = 0.1376 [*] ⋂ = [2]
-S = [20]                                    : p-value = 0.0000 [ ] ⋂ = [2]
-S = [16]                                    : p-value = 0.0000 [ ] ⋂ = [2]
-S = [15]                                    : p-value = 0.0000 [ ] ⋂ = [2]
-                                     ...
-S = [2, 5, 6]                               : p-value = 0.3557 [*] ⋂ = [2]
-S = [5, 6, 20]                              : p-value = 0.1879 [*] ⋂ = Int64[]
+8 variables are screened out from 20 variables with lasso: [5, 7, 8, 9, 11, 12, 15, 17]
+Causal invariance search across 2 environments with at α=0.01 (|S| = 8, method = chow, model = linear)
+
+S = []                                      : p-value = 0.0000 [ ] ⋂ = [5, 7, 8, 9, 11, 12, 15, 17]
+S = [5]                                     : p-value = 0.0000 [ ] ⋂ = [5, 7, 8, 9, 11, 12, 15, 17]
+S = [17]                                    : p-value = 0.0000 [ ] ⋂ = [5, 7, 8, 9, 11, 12, 15, 17]
+S = [15]                                    : p-value = 0.0000 [ ] ⋂ = [5, 7, 8, 9, 11, 12, 15, 17]
+S = [12]                                    : p-value = 0.0000 [ ] ⋂ = [5, 7, 8, 9, 11, 12, 15, 17]
+S = [11]                                    : p-value = 0.0144 [*] ⋂ = [11]
+S = [9]                                     : p-value = 0.0000 [ ] ⋂ = [11]
+S = [8]                                     : p-value = 0.0000 [ ] ⋂ = [11]
+S = [7]                                     : p-value = 0.0000 [ ] ⋂ = [11]
+S = [11, 5]                                 : p-value = 0.0000 [ ] ⋂ = [11]
+S = [11, 12]                                : p-value = 0.0000 [ ] ⋂ = [11]
+S = [11, 15]                                : p-value = 0.0007 [ ] ⋂ = [11]
+S = [7, 11]                                 : p-value = 0.0082 [ ] ⋂ = [11]
+S = [11, 8]                                 : p-value = 0.0000 [ ] ⋂ = [11]
+S = [9, 11]                                 : p-value = 0.0512 [*] ⋂ = [11]
+S = [17, 11]                                : p-value = 0.0000 [ ] ⋂ = [11]
+S = [9, 12]                                 : p-value = 0.0000 [ ] ⋂ = [11]
+S = [9, 15]                                 : p-value = 0.0064 [ ] ⋂ = [11]
+S = [7, 9]                                  : p-value = 0.0000 [ ] ⋂ = [11]
+S = [9, 8]                                  : p-value = 0.0000 [ ] ⋂ = [11]
+S = [9, 5]                                  : p-value = 0.7475 [*] ⋂ = Int64[]
+
+Tested 21 sets: 3 sets are accepted.
 
  * Found no causal variable (empty intersection).
 
- ⋅ Variables considered include [2, 5, 6, 8, 13, 15, 16, 20]
+ ⋅ Variables considered include [5, 7, 8, 9, 11, 12, 15, 17]
 ```
 
-The algorithm cannot find any direct causal variables (parents) of variable 21 due to insufficient power of two environments. The algorithm tends to discover more with more environments. Let us define a new environment where we perform a noise (soft) intervention that changes the equations for 5 variables other than the target. Note it is important that the target is left untouched.
+The algorithm **cannot find any** direct causal variables (parents) of variable 21 due to **insufficient power** of two environments. The algorithm tends to **discover more** with **more environments**. Let us define a new environment where we perform a **noise (soft) intervention** that changes the equations for 5 variables other than the target. Note it is important that the **target** is left **untouched**.
 
 ```Julia
 julia> sem_noise, variables_intervened = random_noise_intervened_SEM(sem_obs, p_intervened=5, avoid=[21])
 
 (Gaussian SEM with 21 variables:
 B =
       Sparsity Pattern
-      ┌─────────────┐
-    1 │⠀⠀⠂⠄⠀⠔⠀⠀⠂⠂⡆│ > 0
-      │⢀⢠⠈⡀⠠⠠⣀⠀⠀⠅⠀│ < 0
-      │⠀⠐⠉⠀⠈⠠⠘⠀⠀⠆⠉│
-      │⠀⠐⢠⠀⠀⡀⠐⠀⢂⠀⡂│
-      │⠀⠠⢐⠀⠉⠵⠠⠁⠄⠈⠂│
-   21 │⠈⠄⠸⠀⠀⠈⠀⠀⠉⠀⠁│
-      └─────────────┘
+      ┌───────────┐
+    1 │⠀⠠⠀⠀⢐⠀⠀⠄⠀⢔⠀│ > 0
+      │⠠⠀⠠⠨⠁⠀⠄⠀⠀⠸⠀│ < 0
+      │⠠⠈⠈⠀⠌⠠⠀⠅⠀⠩⠉│
+      │⠠⣨⠴⠰⠪⠠⠄⠀⠸⠉⣐│
+      │⢀⠲⠈⢠⠠⠀⠀⠂⠀⠲⠁│
+   21 │⠀⠐⠀⠀⠠⠠⠀⠀⠀⠔⠀│
+      └───────────┘
       1          21
-        nz = 63
-σ² = [1.3996, 1.20882, 1.87259, 1.15587, 0.631316, 1.38616, 1.45151, 1.73923, 2.55396, 1.11023, 1.24599, 0.958217, 0.506628, 1.94525, 2.16212, 1.53593, 1.69174, 0.649617, 1.121, 2.19366, 0.692029], [9, 15, 13, 2, 20])
+        nz = 70σ² = [1.9727697778060356, 1.1224733663047743, 1.1798805640594814, 1.2625825149076064, 0.8503782631176267, 0.5262963446298372, 1.3835334059064883, 1.788996301274282, 1.759286517329432, 0.5837984015051159, 3.01957479564807, 0.9492838187140921, 1.9398913901673531, 1.7729995603828317, 0.7110857327642559, 1.6837378902964577, 1.2089053651343495, 1.3069888003095986, 1.3933773717634643, 1.0571823834646068, 1.9187793877731028], [17, 13, 10, 11, 12])
 ```
 
-Here the equations for variables 9, 15, 13, 2, 20 have been changed. Now we simulate from this modified SEM and call it environment 3. We run the algorithm on all 3 environments.
+Here the equations for variables 17, 13, 10, 11, 12 have been changed. Now we simulate from this modified SEM and call it **environment 3**. We run the algorithm on all **3 environments**.
 
 ```Julia
 julia> X3 = simulate(sem_noise, 1000)
 julia> causalSearch(vcat(X1, X2, X3)[:,1:20], vcat(X1, X2, X3)[:,21], repeat([1,2,3], inner=1000))
 ```
 
-The algorithm searches over subsets for a while and successfully discovers variables 2. 
+The algorithm searches over subsets for a while and successfully **discovers** variable 11. The other two causes, 9 and 18, can hopefully be discovered given even more environments.
 
 ```
-8 variables are screened out from 20 variables with lasso: [1, 2, 5, 6, 8, 13, 15, 20]
-Causal invariance search across 3 environments with at α=0.01 (|S| = 8, method = chow)
-
-S = []                                      : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [1]                                     : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [20]                                    : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [15]                                    : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [13]                                    : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [8]                                     : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [6]                                     : p-value = 0.0000 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [5]                                     : p-value = 0.0001 [ ] ⋂ = [1, 2, 5, 6, 8, 13, 15, 20]
-S = [2]                                     : p-value = 0.1714 [*] ⋂ = [2]
-S = [5, 1]                                  : p-value = 0.0000 [ ] ⋂ = [2]
-S = [2, 5]                                  : p-value = 0.2211 [*] ⋂ = [2]
-S = [5, 20]                                 : p-value = 0.0000 [ ] ⋂ = [2]
-                                      ...
-S = [1, 13, 2, 5, 8, 15, 6]                 : p-value = 0.4380 [*] ⋂ = [2]
-S = [20, 6, 13, 2, 5, 8, 15, 1]             : p-value = 0.6916 [*] ⋂ = [2]
-
- * Causal variables include: [2]
-
-variable   	 1.0 % 		 99.0 %
-2          	 0.5831 	 0.7054
-
- ⋅ Variables considered include [1, 2, 5, 6, 8, 13, 15, 20]
+causalSearch(vcat(X1, X2, X3)[:,1:20], vcat(X1, X2, X3)[:,21], repeat([1,2,3], inner=1000))
+8 variables are screened out from 20 variables with lasso: [4, 5, 7, 8, 9, 11, 12, 16]
+Causal invariance search across 3 environments with at α=0.01 (|S| = 8, method = chow, model = linear)
+
+S = []                                      : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [4]                                     : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [16]                                    : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [12]                                    : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [11]                                    : p-value = 0.0084 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [9]                                     : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [8]                                     : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [7]                                     : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [5]                                     : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [4, 11]                                 : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [11, 5]                                 : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [11, 8]                                 : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [7, 11]                                 : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [9, 11]                                 : p-value = 0.0000 [ ] ⋂ = [4, 5, 7, 8, 9, 11, 12, 16]
+S = [16, 11]                                : p-value = 0.0709 [*] ⋂ = [11, 16]
+S = [11, 12]                                : p-value = 0.0000 [ ] ⋂ = [11, 16]
+																			...
+S = [7, 9, 4, 16, 11, 5, 12]                : p-value = 0.0000 [ ] ⋂ = [11]
+S = [7, 9, 4, 16, 11, 8, 12]                : p-value = 0.0001 [ ] ⋂ = [11]
+S = [7, 4, 9, 16, 11, 5, 8, 12]             : p-value = 0.0002 [ ] ⋂ = [11]
+
+Tested 256 sets: 6 sets are accepted.
+
+ * Causal variables include: [11]
+
+   variable   	 1.0 % 		 99.0 %
+   11         	 0.1123 	 1.1017
+
+ ⋅ Variables considered include [4, 5, 7, 8, 9, 11, 12, 16]
 ```
 
 ### Functionalities
@@ -181,24 +206,8 @@ variable   	 1.0 % 		 99.0 %
 
 ###  Features
 
-- High performance implementation in Julia v.0.6
+- High performance implementation in Julia v1.x
 - Faster search: 
   - skipping testing supersets of A if A is accepted ( under  `selection_only` mode)
   - Priority queue to prioritize testing sets likely to be invariant
 
-### Todo
-
-- ~~Confidence intervals~~
-- ~~Logistic regression~~
-- ~~Variable screening~~
-  - ~~glmnet~~
-  - ~~HOLP~~
-- ~~Subsampling for large n in Chow's test~~
-- Nonparametric two-sample tests
-- Hidden variable case
-- ~~Inference of graph and plotting~~
-
-### Issues
-
-- ~~Better reporting~~
-
diff --git a/src/causalSearch.jl b/src/causalSearch.jl
@@ -106,7 +106,7 @@ function causalSearch(X::Union{Matrix{Float64}, DataFrame}, y::Vector{Float64},
         model = "logistic"
         # combine into a DataFrame (note: GLM.jl has to work with DataFrame)
         @assert all((y.==1) .| (y.==0))
-        df = DataFrame(hcat(X, y, makeunique=true))
+        df = DataFrame(isa(X, DataFrame) ? hcat(X, y, makeunique=true) : hcat(X, y))
         for _col in propertynames(df)
             if isa(df[!, _col], CategoricalArray)
                 @assert length(unique(df[!, _col])) == 2 "categorical variable $_col should be recoded to binary"
diff --git a/src/conditionalInvTests.jl b/src/conditionalInvTests.jl
@@ -179,9 +179,10 @@ function conditional_inv_test_logistic(df::DataFrame, target::Symbol, S::Vector{
     return reject, p_value, conf_intervals
 end
 
-function conditional_inv_test_logistic_LR(df::DataFrame, target::Symbol, S::Vector{Int64},
-                                            env::Vector{Int64}, n_env::Int64; α=0.01, add_intercept=true)
-    conditional_inv_test_logistic_LR(df, target, names(df)[S], env, n_env; α=α, add_intercept=add_intercept)                                            
+function conditional_inv_test_logistic(df::DataFrame, target::Symbol, S::Vector{Int64},
+    env::Vector{Int64}, n_env::Int64; α=0.01, add_intercept=true, method="logistic-LR")
+    conditional_inv_test_logistic(df, target, propertynames(df)[S], env, n_env; 
+        α=α, add_intercept=add_intercept, method=method)                                            
 end
 
 """
diff --git a/test/test_search.jl b/test/test_search.jl
@@ -20,8 +20,13 @@ end
                 x3 = [0.0  0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  0.0][:],
                 y =  [1.0  1.0  1.0  0.0  0.0  1.0  1.0  1.0  0.0  1.0  1.0  0.0  0.0  1.0  1.0  0.0][:])
     env = repeat([1,2], inner=8)
+    X = Matrix{Float64}(df[!, 1:3])
     r1 = causalSearch(df, :y, env, method="logistic-LR", iterate_all=true)
     @test length(r1.S) == 0
+    r1 = causalSearch(X, df.y, env, method="logistic-LR", iterate_all=true)
+    @test length(r1.S) == 0
     r2 = causalSearch(df, :y, env, method="logistic-SF", iterate_all=true)
     @test length(r2.S) == 0
+    r2 = causalSearch(X, df.y, env, method="logistic-SF", iterate_all=true)
+    @test length(r2.S) == 0
 end