This repository was archived by the owner on Aug 26, 2022. It is now read-only.

Commit a7ab69c

Merge pull request #33 from tunib-ai/kevin

merge changes from functorch upstream, fix docs

2 parents 3d7e792 + ecf16ed

File tree

16 files changed: +492 −54 lines


docs/TUTORIALS/tensor_model_parallelism.html

Lines changed: 3 additions & 3 deletions

@@ -486,11 +486,11 @@ <h3>1.1. Create model and tokenizer<a class="headerlink" href="#create-model-and
 <div class="section" id="parallelize-the-model">
 <h3>1.2. Parallelize the model<a class="headerlink" href="#parallelize-the-model" title="Permalink to this headline"></a></h3>
 <ul class="simple">
-<li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be same or smaller than total number of gpus.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be same or smaller than total num of gpus.</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be power of 2. (e.g. 2, 4, 8, 16, …)</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be positive number.</p></li>
-<li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be same or greater than hidden size</p></li>
-<li><p><code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code> must be same or greater than the number of heads</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">hidden</span> <span class="pre">size</span></code> must be same or greater than <code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code></p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">the</span> <span class="pre">number</span> <span class="pre">of</span> <span class="pre">heads</span></code> must be same or greater than <code class="docutils literal notranslate"><span class="pre">tensor_parallel_size</span></code></p></li>
 </ul>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">oslo</span>

docs/_sources/tutorials/tensor_model_parallelism.md.txt

Lines changed: 3 additions & 3 deletions

@@ -49,11 +49,11 @@ tokenizer = AutoTokenizer.from_pretrained("gpt2")
 ```
 
 ### 1.2. Parallelize the model
-- ``tensor_parallel_size`` must be same or smaller than total number of gpus.
+- ``tensor_parallel_size`` must be same or smaller than total num of gpus.
 - ``tensor_parallel_size`` must be power of 2. (e.g. 2, 4, 8, 16, ...)
 - ``tensor_parallel_size`` must be positive number.
-- ``tensor_parallel_size`` must be same or greater than hidden size
-- ``tensor_parallel_size`` must be same or greater than the number of heads
+- ``hidden size`` must be same or greater than ``tensor_parallel_size``
+- ``the number of heads`` must be same or greater than ``tensor_parallel_size``
 
 ```python
 import oslo

docs/searchindex.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

docs/source/TUTORIALS/tensor_model_parallelism.md

Lines changed: 3 additions & 3 deletions

@@ -49,11 +49,11 @@ tokenizer = AutoTokenizer.from_pretrained("gpt2")
 ```
 
 ### 1.2. Parallelize the model
-- ``tensor_parallel_size`` must be same or smaller than total number of gpus.
+- ``tensor_parallel_size`` must be same or smaller than total num of gpus.
 - ``tensor_parallel_size`` must be power of 2. (e.g. 2, 4, 8, 16, ...)
 - ``tensor_parallel_size`` must be positive number.
-- ``tensor_parallel_size`` must be same or greater than hidden size
-- ``tensor_parallel_size`` must be same or greater than the number of heads
+- ``hidden size`` must be same or greater than ``tensor_parallel_size``
+- ``the number of heads`` must be same or greater than ``tensor_parallel_size``
 
 ```python
 import oslo
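The constraint list in the tutorial diff above reads naturally as runtime validation. A minimal sketch of those checks follows; the function and argument names are illustrative only, not part of oslo's API, which performs its own checks internally:

```python
def validate_tensor_parallel_size(tensor_parallel_size, num_gpus, hidden_size, num_heads):
    # Hypothetical validator mirroring the documented constraints.
    assert tensor_parallel_size > 0, "tensor_parallel_size must be a positive number"
    assert tensor_parallel_size <= num_gpus, "must not exceed the total number of GPUs"
    # A power of two has exactly one bit set, so n & (n - 1) == 0.
    assert tensor_parallel_size & (tensor_parallel_size - 1) == 0, "must be a power of 2"
    assert hidden_size >= tensor_parallel_size, "hidden size must be >= tensor_parallel_size"
    assert num_heads >= tensor_parallel_size, "number of heads must be >= tensor_parallel_size"

# A GPT-2-like configuration satisfies every constraint.
validate_tensor_parallel_size(4, num_gpus=8, hidden_size=768, num_heads=12)
```

Note that the last two constraints are the ones this commit rewrites: it is the hidden size and head count that must be at least `tensor_parallel_size`, not the other way around.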

oslo/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 # Copyright 2021 TUNiB Inc.
 
-version = "2.0.0"
+version = "2.0.1"
Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 """
 LAST UPSTREAM INFORMATION
 
-- date: 2022/02/14
-- commit: https://github.com/pytorch/functorch/commit/cd41d6ebc0402d94ae6af51f163ee728277a7aa4
+- date: 2022/02/21
+- commit: https://github.com/pytorch/functorch/commit/0c0f325ba3c83e70c215f231cfd810af68141767
 """

oslo/pytorch/kernel_fusion/mem_efficient/aot_autograd.py

Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ def _reshape_alias(x, shape, strides):
     return aten.view(x, shape)
 
 
+
 def create_aot_autograd_function(
     flat_fn, fw_compiler, bw_compiler, partition_fn, decompositions, grad_state
 ):

oslo/pytorch/kernel_fusion/mem_efficient/compilers.py

Lines changed: 13 additions & 3 deletions

@@ -13,8 +13,16 @@
 )
 
 
+def _canonicalize(fx_g):
+    for node in fx_g.graph.nodes:
+        if node.target == torch.ops.aten._s_where:
+            node.target = torch.ops.aten.where
+    fx_g.recompile()
+    return fx_g
+
+
 def ts_compile(fx_g, _):
-    # print(fx_g.code)
+    fx_g = _canonicalize(fx_g)
     for node in fx_g.graph.nodes:
         if node.target == torch.ops.aten.new_zeros:
             if node.args[1] == []:

@@ -215,6 +223,7 @@ def nop(f, _):
 
 
 def simple_ts_compile(fx_g, _):
+    fx_g = _canonicalize(fx_g)
     f = torch.jit.script(fx_g)
     f = torch.jit.freeze(f.eval())
     return f

@@ -284,12 +293,13 @@ def debug_compile(fx_g, inps):
 ##############################################################
 import torch
 import torch.fx as fx
-from torch.compile import minimizer, check_nvfuser_subprocess
+from functorch.compile import minifier, check_nvfuser_subprocess
 inps = {[(i.shape, i.dtype) for i in inps]}
+inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
 from foo import FxModule
 mod = FxModule().cuda()
 with torch.jit.fuser("fuser2"):
-    minimizer(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
+    minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
 """
 )
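The `_canonicalize` pass added in this commit retargets every `aten._s_where` node to the public `aten.where` op and then recompiles the FX graph before scripting. The pattern, walk the node list, swap targets, recompile, can be sketched without torch; `Node`, `Graph`, and `GraphModule` below are simplified stand-ins for the torch.fx types, not the real API:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    # In torch.fx, target is the callable/op the node invokes.
    target: str

@dataclass
class Graph:
    nodes: List[Node] = field(default_factory=list)

class GraphModule:
    def __init__(self, nodes):
        self.graph = Graph(list(nodes))
        self.recompiled = False

    def recompile(self):
        # torch.fx regenerates the module's forward() from the graph here;
        # this stand-in only records that it happened.
        self.recompiled = True

def canonicalize(gm: GraphModule) -> GraphModule:
    # Mirror of _canonicalize: rewrite private _s_where targets to where.
    for node in gm.graph.nodes:
        if node.target == "aten._s_where":
            node.target = "aten.where"
    gm.recompile()
    return gm
```

Running the pass over a toy graph rewrites only the `_s_where` node and leaves every other node untouched, which is why both `ts_compile` and `simple_ts_compile` can call it unconditionally before `torch.jit.script`.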
