From 4ca50775840814b72e96da139d5dccd5691d02f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Wed, 15 Jan 2025 21:14:06 +0800 Subject: [PATCH 1/3] init --- docs/developer-guide/operators.md | 2307 +++++++++-------- tools/pnnx/src/CMakeLists.txt | 1 + .../pnnx/src/pass_ncnn/torch_index_select.cpp | 58 + 3 files changed, 1271 insertions(+), 1095 deletions(-) create mode 100644 tools/pnnx/src/pass_ncnn/torch_index_select.cpp diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 10fe1f03f0f6..1c915cf14fdb 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -1,168 +1,177 @@ - -* [AbsVal](#absval) -* [ArgMax](#argmax) -* [BatchNorm](#batchnorm) -* [Bias](#bias) -* [BinaryOp](#binaryop) -* [BNLL](#bnll) -* [Cast](#cast) -* [CELU](#celu) -* [Clip](#clip) -* [Concat](#concat) -* [Convolution](#convolution) -* [Convolution1D](#convolution1d) -* [Convolution3D](#convolution3d) -* [ConvolutionDepthWise](#convolutiondepthwise) -* [ConvolutionDepthWise1D](#convolutiondepthwise1d) -* [ConvolutionDepthWise3D](#convolutiondepthwise3d) -* [CopyTo](#copyto) -* [Crop](#crop) -* [CumulativeSum](#cumulativesum) -* [Deconvolution](#deconvolution) -* [Deconvolution1D](#deconvolution1d) -* [Deconvolution3D](#deconvolution3d) -* [DeconvolutionDepthWise](#deconvolutiondepthwise) -* [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) -* [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) -* [DeformableConv2D](#deformableconv2d) -* [Dequantize](#dequantize) -* [Diag](#diag) -* [Dropout](#dropout) -* [Eltwise](#eltwise) -* [ELU](#elu) -* [Embed](#embed) -* [Exp](#exp) -* [Flatten](#flatten) -* [Fold](#fold) -* [GELU](#gelu) -* [GLU](#glu) -* [Gemm](#gemm) -* [GridSample](#gridsample) -* [GroupNorm](#groupnorm) -* [GRU](#gru) -* [HardSigmoid](#hardsigmoid) -* [HardSwish](#hardswish) -* [InnerProduct](#innerproduct) -* [Input](#input) -* 
[InstanceNorm](#instancenorm) -* [Interp](#interp) -* [InverseSpectrogram](#inversespectrogram) -* [LayerNorm](#layernorm) -* [Log](#log) -* [LRN](#lrn) -* [LSTM](#lstm) -* [MemoryData](#memorydata) -* [Mish](#mish) -* [MultiHeadAttention](#multiheadattention) -* [MVN](#mvn) -* [Noop](#noop) -* [Normalize](#normalize) -* [Packing](#packing) -* [Padding](#padding) -* [Permute](#permute) -* [PixelShuffle](#pixelshuffle) -* [Pooling](#pooling) -* [Pooling1D](#pooling1d) -* [Pooling3D](#pooling3d) -* [Power](#power) -* [PReLU](#prelu) -* [Quantize](#quantize) -* [Reduction](#reduction) -* [ReLU](#relu) -* [Reorg](#reorg) -* [Requantize](#requantize) -* [Reshape](#reshape) -* [RMSNorm](#rmsnorm) -* [RNN](#rnn) -* [Scale](#scale) -* [SELU](#selu) -* [Shrink](#shrink) -* [ShuffleChannel](#shufflechannel) -* [Sigmoid](#sigmoid) -* [Slice](#slice) -* [Softmax](#softmax) -* [Softplus](#softplus) -* [Spectrogram](#spectrogram) -* [Split](#split) -* [Swish](#swish) -* [TanH](#tanh) -* [Threshold](#threshold) -* [Tile](#tile) -* [UnaryOp](#unaryop) -* [Unfold](#unfold) +- [AbsVal](#absval) +- [ArgMax](#argmax) +- [BatchNorm](#batchnorm) +- [Bias](#bias) +- [BinaryOp](#binaryop) +- [BNLL](#bnll) +- [Cast](#cast) +- [CELU](#celu) +- [Clip](#clip) +- [Concat](#concat) +- [Convolution](#convolution) +- [Convolution1D](#convolution1d) +- [Convolution3D](#convolution3d) +- [ConvolutionDepthWise](#convolutiondepthwise) +- [ConvolutionDepthWise1D](#convolutiondepthwise1d) +- [ConvolutionDepthWise3D](#convolutiondepthwise3d) +- [CopyTo](#copyto) +- [Crop](#crop) +- [CumulativeSum](#cumulativesum) +- [Deconvolution](#deconvolution) +- [Deconvolution1D](#deconvolution1d) +- [Deconvolution3D](#deconvolution3d) +- [DeconvolutionDepthWise](#deconvolutiondepthwise) +- [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d) +- [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d) +- [DeformableConv2D](#deformableconv2d) +- [Dequantize](#dequantize) +- [Diag](#diag) +- [Dropout](#dropout) +- 
[Eltwise](#eltwise) +- [ELU](#elu) +- [Embed](#embed) +- [Exp](#exp) +- [Flatten](#flatten) +- [Fold](#fold) +- [GELU](#gelu) +- [GLU](#glu) +- [Gemm](#gemm) +- [GridSample](#gridsample) +- [GroupNorm](#groupnorm) +- [GRU](#gru) +- [HardSigmoid](#hardsigmoid) +- [HardSwish](#hardswish) +- [IndexSelect](#indexselect) +- [InnerProduct](#innerproduct) +- [Input](#input) +- [InstanceNorm](#instancenorm) +- [Interp](#interp) +- [InverseSpectrogram](#inversespectrogram) +- [LayerNorm](#layernorm) +- [Log](#log) +- [LRN](#lrn) +- [LSTM](#lstm) +- [MemoryData](#memorydata) +- [Mish](#mish) +- [MultiHeadAttention](#multiheadattention) +- [MVN](#mvn) +- [Noop](#noop) +- [Normalize](#normalize) +- [Packing](#packing) +- [Padding](#padding) +- [Permute](#permute) +- [PixelShuffle](#pixelshuffle) +- [Pooling](#pooling) +- [Pooling1D](#pooling1d) +- [Pooling3D](#pooling3d) +- [Power](#power) +- [PReLU](#prelu) +- [Quantize](#quantize) +- [Reduction](#reduction) +- [ReLU](#relu) +- [Reorg](#reorg) +- [Requantize](#requantize) +- [Reshape](#reshape) +- [RMSNorm](#rmsnorm) +- [RNN](#rnn) +- [Scale](#scale) +- [SELU](#selu) +- [Shrink](#shrink) +- [ShuffleChannel](#shufflechannel) +- [Sigmoid](#sigmoid) +- [Slice](#slice) +- [Softmax](#softmax) +- [Softplus](#softplus) +- [Spectrogram](#spectrogram) +- [Split](#split) +- [Swish](#swish) +- [TanH](#tanh) +- [Threshold](#threshold) +- [Tile](#tile) +- [UnaryOp](#unaryop) +- [Unfold](#unfold) # AbsVal + ``` y = abs(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # ArgMax + ``` y = argmax(x, out_max_val, topk) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_max_val | int | 0 | | -| 1 | topk | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ---- | ------- | ----------- | +| 0 | out_max_val | int | 0 | | +| 1 | topk | int | 1 | | # BatchNorm + 
``` y = (x - mean) / sqrt(var + eps) * slope + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.f | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.f | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [channels] | -| mean_data | float | [channels] | -| var_data | float | [channels] | -| bias_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| slope_data | float | [channels] | +| mean_data | float | [channels] | +| var_data | float | [channels] | +| bias_data | float | [channels] | # Bias + ``` y = x + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| bias_data | float | [channels] | +| weight | type | shape | +| --------- | ----- | ---------- | +| bias_data | float | [channels] | # BinaryOp - This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). + +This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting). 
+ ``` C = binaryop(A, B) ``` + if with_scalar = 1: + - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | -| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | -| 2 | b | float | 0.f | When B is a scalar, B = b | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | -------------------------------------------------------- | +| 0 | op_type | int | 0 | Operation type as follows | +| 1 | with_scalar | int | 0 | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar | +| 2 | b | float | 0.f | When B is a scalar, B = b | Operation type: + - 0 = ADD - 1 = SUB - 2 = MUL @@ -177,28 +186,31 @@ Operation type: - 11 = RATAN2 # BNLL + ``` y = log(1 + e^(-x)) , x > 0 y = log(1 + e^x), x < 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Cast + ``` y = cast(x) ``` -* one_blob_only -* support_packing +- one_blob_only +- support_packing -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | type_from | int | 0 | | -| 1 | type_to | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | type_from | int | 0 | | +| 1 | type_to | int | 0 | | Element type: + - 0 = auto - 1 = float32 - 2 = float16 @@ -206,702 +218,730 @@ Element type: - 4 = bfloat16 # CELU + ``` if x < 0 y = (exp(x / alpha) - 1.f) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | 
# Clip + ``` y = clamp(x, min, max) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | min | float | -FLT_MAX | | -| 1 | max | float | FLT_MAX | | +| param id | name | type | default | description | +| -------- | ---- | ----- | -------- | ----------- | +| 0 | min | float | -FLT_MAX | | +| 1 | max | float | FLT_MAX | | # Concat + ``` y = concat(x0, x1, x2, ...) by axis ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Convolution + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| 
bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # Convolution1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | 
----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | --------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Convolution3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| 
------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | type | shape | +| ----------- | --------------- | ----------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # ConvolutionDepthWise + ``` x2 = pad(x, pads, pad_value) x3 = conv(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | 
[ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [group] | -| bottom_blob_int8_scales| float | [1] | -| top_blob_int8_scales| float | [1] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------------------- | --------------- | ------------------------------------------------------------------ | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [group] | +| bottom_blob_int8_scales | float | [1] | +| top_blob_int8_scales | float | [1] | # ConvolutionDepthWise1D + ``` x2 = pad(x, pads, pad_value) x3 = conv1d(x2, weight, kernel, stride, 
dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | pad_value | float | 0.f | | -| 19 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | pad_value | float | 0.f | | +| 19 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | --------------- | -------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # ConvolutionDepthWise3D + ``` x2 = pad(x, pads, pad_value) x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | 
----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | pad_value | float | 0.f | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | pad_value | float | 0.f | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | + +| weight | 
type | shape | +| ----------- | --------------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # CopyTo + ``` self[offset] = src ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 9 | starts | array | [ ] | | -| 11 | axes | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 9 | starts | array | [ ] | | +| 11 | axes | array | [ ] | | # Crop + ``` y = crop(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | woffset | int | 0 | | -| 1 | hoffset | int | 0 | | -| 13 | doffset | int | 0 | | -| 2 | coffset | int | 0 | | -| 3 | outw | int | 0 | | -| 4 | outh | int | 0 | | -| 14 | outd | int | 0 | | -| 5 | outc | int | 0 | | -| 6 | woffset2 | int | 0 | | -| 7 | hoffset2 | int | 0 | | -| 15 | doffset2 | int | 0 | | -| 8 | coffset2 | int | 0 | | -| 9 | starts | array | [ ] | | -| 10 | ends | array | [ ] | | -| 11 | axes | array | [ ] | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------- | +| 0 | woffset | int | 0 | | +| 1 | hoffset | int | 0 | | +| 13 | doffset | int | 0 | | +| 2 | coffset | int | 0 | | +| 3 | outw | int | 0 | | +| 4 | outh | int | 0 | | +| 14 | outd | int | 0 | | +| 5 | outc | int | 0 | | +| 6 | woffset2 | int | 0 | | +| 7 | hoffset2 | int | 0 | | +| 15 | doffset2 | int | 0 | | +| 8 | 
coffset2 | int | 0 | | +| 9 | starts | array | [ ] | | +| 10 | ends | array | [ ] | | +| 11 | axes | array | [ ] | | # CumulativeSum If axis < 0, we use axis = x.dims + axis -It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html +It implements -* one_blob_only -* support_inplace - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +- one_blob_only +- support_inplace +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Deconvolution + ``` x2 = deconv(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | 
num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | 
-------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | --------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input, num_output] | +| bias_data | float | [num_output] | # Deconvolution3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| 
------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ----------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise + ``` x2 = deconv(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | 
pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | ------------------------------------------------------------------ | +| weight_data | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] | +| bias_data | 
float | [num_output] | # DeconvolutionDepthWise1D + ``` x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 15 | pad_right | int | pad_left | | -| 18 | output_pad_right| int | 0 | | -| 20 | output_w | int | 0 | | -| 28 | dynamic_weight| int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | -------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 15 | pad_right | int | pad_left | | +| 18 | output_pad_right | int | 0 | | +| 20 | output_w | int | 0 | | +| 28 | dynamic_weight | int | 0 | | + +| weight | type | shape | +| ----------- | ---------- | -------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeconvolutionDepthWise3D + ``` x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias x3 = depad(x2, pads, 
pad_value) y = activation(x3, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 7 | group | int | 1 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 17 | pad_behind | int | pad_front | | -| 18 | output_pad_right| int | 0 | | -| 19 | output_pad_bottom| int | output_pad_right | | -| 20 | output_pad_behind| int | output_pad_right | | -| 21 | kernel_d | int | kernel_w | | -| 22 | dilation_d | int | dilation_w | | -| 23 | stride_d | int | stride_w | | -| 24 | pad_front | int | pad_left | | -| 25 | output_w | int | 0 | | -| 26 | output_h | int | output_w | | -| 27 | output_d | int | output_w | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | -| bias_data | float | [num_output] | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 7 | group | int | 1 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 
| stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 17 | pad_behind | int | pad_front | | +| 18 | output_pad_right | int | 0 | | +| 19 | output_pad_bottom | int | output_pad_right | | +| 20 | output_pad_behind | int | output_pad_right | | +| 21 | kernel_d | int | kernel_w | | +| 22 | dilation_d | int | dilation_w | | +| 23 | stride_d | int | stride_w | | +| 24 | pad_front | int | pad_left | | +| 25 | output_w | int | 0 | | +| 26 | output_h | int | output_w | | +| 27 | output_d | int | output_w | | + +| weight | type | shape | +| ----------- | ---------- | ---------------------------------------------------------------------------- | +| weight_data | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] | +| bias_data | float | [num_output] | # DeformableConv2D + ``` x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias y = activation(x2, act_type, act_params) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 5 | bias_term | int | 0 | | -| 6 | weight_data_size| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | -| bias_data | float | [num_output] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ---------- | 
----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 5 | bias_term | int | 0 | | +| 6 | weight_data_size | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | + +| weight | type | shape | +| ----------- | --------------- | ------------------------------------------- | +| weight_data | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] | +| bias_data | float | [num_output] | # Dequantize + ``` y = x * scale + bias ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | -| 1 | bias_data_size| int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | +| 1 | bias_data_size | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [bias_data_size] | # Diag + ``` y = diag(x, diagonal) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | diagonal | int | 0 | | +| param id | name | type | default | description | +| -------- | -------- | ---- | ------- | ----------- | +| 0 | diagonal | int | 0 | | # Dropout + ``` y = x * scale ``` -* 
one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | scale | float | 1.f | | # Eltwise + ``` y = elementwise_op(x0, x1, ...) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | | -| 1 | coeffs | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | op_type | int | 0 | | +| 1 | coeffs | array | [ ] | | Operation type: + - 0 = PROD - 1 = SUM - 2 = MAX # ELU + ``` if x < 0 y = (exp(x) - 1) * alpha else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.1f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.1f | | # Embed + ``` y = embedding(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | input_dim | int | 0 | | -| 2 | bias_term | int | 0 | | -| 3 | weight_data_size | int | 0 | | -| 18 | int8_scale_term| int | 0 | | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | input_dim | int | 0 | | +| 2 | bias_term | int | 0 | | +| 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float | [weight_data_size] | -| bias_term | float | [num_output] | -| 
weight_data_int8_scales| float | [1] | +| weight | type | shape | +| ----------------------- | ----- | ------------------ | +| weight_data | float | [weight_data_size] | +| bias_term | float | [num_output] | +| weight_data_int8_scales | float | [1] | # Exp + ``` if base == -1 y = exp(shift + x * scale) else y = pow(base, (shift + x * scale)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # Flatten + Reshape blob to 1 dimension -* one_blob_only +- one_blob_only # Fold + ``` y = fold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | -| 20 | output_w | int | 0 | | -| 21 | output_h | int | output_w | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | 
int | pad_left | | +| 16 | pad_bottom | int | pad_top | | +| 20 | output_w | int | 0 | | +| 21 | output_h | int | output_w | | # GELU + ``` if fast_gelu == 1 y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x))); else y = 0.5 * x * erfc(-0.70710678 * x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | fast_gelu | int | 0 | use approximation | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------------- | +| 0 | fast_gelu | int | 0 | use approximation | # GLU @@ -913,13 +953,14 @@ where a is the first half of the input matrix and b is the second half. axis specifies the dimension to split the input -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | axis | int | 0 | | # Gemm + ``` a = transA ? transpose(x0) : x0 b = transb ? 
transpose(x1) : x1 @@ -927,88 +968,91 @@ c = x2 y = (gemm(a, b) + c * beta) * alpha ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.f | | -| 1 | beta | float | 1.f | | -| 2 | transA | int | 0 | | -| 3 | transb | int | 0 | | -| 4 | constantA | int | 0 | | -| 5 | constantB | int | 0 | | -| 6 | constantC | int | 0 | | -| 7 | constantM | int | 0 | | -| 8 | constantN | int | 0 | | -| 9 | constantK | int | 0 | | -| 10 | constant_broadcast_type_C | int | 0 | | -| 11 | output_N1M | int | 0 | | -| 12 | output_elempack | int | 0 | | -| 13 | output_elemtype | int | 0 | | -| 14 | output_transpose | int| 0 | | -| 18 | int8_scale_term | int | 0 | | -| 20 | constant_TILE_M | int | 0 | | -| 21 | constant_TILE_N | int | 0 | | -| 22 | constant_TILE_K | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| A_data | float/fp16/int8 | [M, K] or [K, M] | -| B_data | float/fp16/int8 | [N, K] or [K, N] | -| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | -| A_data_int8_scales| float | [M] | -| B_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | ------- | ----------- | +| 0 | alpha | float | 1.f | | +| 1 | beta | float | 1.f | | +| 2 | transA | int | 0 | | +| 3 | transb | int | 0 | | +| 4 | constantA | int | 0 | | +| 5 | constantB | int | 0 | | +| 6 | constantC | int | 0 | | +| 7 | constantM | int | 0 | | +| 8 | constantN | int | 0 | | +| 9 | constantK | int | 0 | | +| 10 | constant_broadcast_type_C | int | 0 | | +| 11 | output_N1M | int | 0 | | +| 12 | output_elempack | int | 0 | | +| 13 | output_elemtype | int | 0 | | +| 14 | output_transpose | int | 0 | | +| 18 | int8_scale_term | int | 0 | | +| 20 | constant_TILE_M | int | 0 | | +| 21 | constant_TILE_N | int | 0 | | +| 22 | constant_TILE_K | int | 0 | | + +| weight | type | shape | +| 
------------------ | --------------- | -------------------------------------------- | +| A_data | float/fp16/int8 | [M, K] or [K, M] | +| B_data | float/fp16/int8 | [N, K] or [K, N] | +| C_data | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] | +| A_data_int8_scales | float | [M] | +| B_data_int8_scales | float | [1] | # GridSample + ``` Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. -For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, +For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, which are used to interpolate the output value output[:, h2, w2] This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks . ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | sample_type | int | 1 | | -| 1 | padding_mode | int | 1 | | -| 2 | align_corner | int | 0 | | -| 3 | permute_fusion| int | 0 | fuse with permute | - +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------------- | +| 0 | sample_type | int | 1 | | +| 1 | padding_mode | int | 1 | | +| 2 | align_corner | int | 0 | | +| 3 | permute_fusion | int | 0 | fuse with permute | Sample type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic Padding mode: + - 1 = zeros - 2 = border - 3 = reflection - # GroupNorm + ``` split x along channel axis into group x0, x1 ... l2 normalize for each group x0, x1 ... 
y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | channels | int | 0 | | -| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 3 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | group | int | 1 | | +| 1 | channels | int | 0 | | +| 2 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 3 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # GRU + Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` @@ -1016,134 +1060,149 @@ y = gru(x) y0, hidden y1 = gru(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] | +| weight | type | shape | +| -------------- | --------------- | -------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output * 3, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output * 3, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # HardSigmoid + ``` y = clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | 
float | 0.5f | | # HardSwish + ``` y = x * clamp(x * alpha + beta, 0, 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace + +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | alpha | float | 0.2f | | +| 1 | beta | float | 0.5f | | + +# IndexSelect -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 0.2f | | -| 1 | beta | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ---- | ------- | ----------- | +| 0 | alpha | int | 0 | | # InnerProduct + ``` x2 = innerproduct(x, weight) + bias y = activation(x2, act_type, act_params) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | bias_term | int | 0 | | -| 2 | weight_data_size| int | 0 | | -| 8 | int8_scale_term| int | 0 | | -| 9 | activation_type| int | 0 | | -| 10 | activation_params| array | [ ] | | +- one_blob_only -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_data | float/fp16/int8 | [num_input, num_output] | -| bias_data | float | [num_output] | -| weight_data_int8_scales| float | [num_output] | -| bottom_blob_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ----------------- | ----- | ------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | bias_term | int | 0 | | +| 2 | weight_data_size | int | 0 | | +| 8 | int8_scale_term | int | 0 | | +| 9 | activation_type | int | 0 | | +| 10 | activation_params | array | [ ] | | + +| weight | type | shape | +| ----------------------- | --------------- | ----------------------- | +| weight_data | float/fp16/int8 | [num_input, num_output] | +| bias_data | float | [num_output] | +| weight_data_int8_scales | float | [num_output] | +| 
bottom_blob_int8_scales | float | [1] | # Input + ``` y = input ``` -* support_inplace +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | +| param id | name | type | default | description | +| -------- | ---- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | # InstanceNorm + ``` split x along channel axis into instance x0, x1 ... l2 normalize for each channel instance x0, x1 ... y = x * gamma + beta ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | channels | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | -------- | ----- | ------- | ----------------------- | +| 0 | channels | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [channels] | -| beta_data | float | [channels] | +| weight | type | shape | +| ---------- | ----- | ---------- | +| gamma_data | float | [channels] | +| beta_data | float | [channels] | # Interp + ``` if dynamic_target_size == 0 y = resize(x) by fixed size or scale else y = resize(x0, size(x1)) ``` -* one_blob_only if dynamic_target_size == 0 +- one_blob_only if dynamic_target_size == 0 -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | resize_type | int | 0 | | -| 1 | height_scale | float | 1.f | | -| 2 | width_scale | float | 1.f | | -| 3 | output_height | int | 0 | | -| 4 | output_width | int | 0 
| | -| 5 | dynamic_target_size| int | 0 | | -| 6 | align_corner | int | 0 | | +| param id | name | type | default | description | +| -------- | ------------------- | ----- | ------- | ----------- | +| 0 | resize_type | int | 0 | | +| 1 | height_scale | float | 1.f | | +| 2 | width_scale | float | 1.f | | +| 3 | output_height | int | 0 | | +| 4 | output_width | int | 0 | | +| 5 | dynamic_target_size | int | 0 | | +| 6 | align_corner | int | 0 | | Resize type: + - 1 = Nearest - 2 = Bilinear - 3 = Bicubic # InverseSpectrogram + ``` x1 = x as complex x1 = x1 * sqrt(norm) if normalized @@ -1155,77 +1214,82 @@ if returns == 1 return y1 real if returns == 2 return y1 imag ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | returns | int | 1 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| param id | name | type | default | description | +| -------- | ----------- | ---- | --------- | ------------------------------- | +| 0 | n_fft | int | 0 | | +| 1 | returns | int | 1 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | # LayerNorm + ``` split x along outmost axis into part x0, x1 ... l2 normalize for each part x0, x1 ... 
y = x * gamma + beta by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | -| beta_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | +| beta_data | float | [affine_size] | # Log + ``` if base == -1 y = log(shift + x * scale) else y = log(shift + x * scale) / log(base) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | base | float | -1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | base | float | -1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # LRN + ``` if region_type == ACROSS_CHANNELS square_sum = sum of channel window of local_size if region_type == WITHIN_CHANNEL square_sum = sum of spatial window of local_size y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | region_type | int | 0 | | -| 1 | local_size | int | 5 | | 
-| 2 | alpha | float | 1.f | | -| 3 | beta | float | 0.75f | | -| 4 | bias | float | 1.f | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------- | +| 0 | region_type | int | 0 | | +| 1 | local_size | int | 5 | | +| 2 | alpha | float | 1.f | | +| 3 | beta | float | 0.75f | | +| 4 | bias | float | 1.f | | Region type: + - 0 = ACROSS_CHANNELS - 1 = WITHIN_CHANNEL # LSTM + Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. ``` @@ -1233,53 +1297,57 @@ y = lstm(x) y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | output size of output | -| 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| 3 | hidden_size | int | num_output| hidden size | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ---------- | ------------------------------------- | +| 0 | num_output | int | 0 | output size of output | +| 1 | weight_data_size | int | 0 | total size of IFOG weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output | hidden size | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | -| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | 
--------------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data | float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # MemoryData + ``` y = data ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | 0 | | -| 1 | h | int | 0 | | -| 11 | d | int | 0 | | -| 2 | c | int | 0 | | -| 21 | load_type | int | 1 | 1=fp32 | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | w | int | 0 | | +| 1 | h | int | 0 | | +| 11 | d | int | 0 | | +| 2 | c | int | 0 | | +| 21 | load_type | int | 1 | 1=fp32 | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| data | float | [w, h, d, c] | +| weight | type | shape | +| ------ | ----- | ------------ | +| data | float | [w, h, d, c] | # Mish + ``` y = x * tanh(log(exp(x) + 1)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # MultiHeadAttention + ``` split q k v into num_head part q0, k0, v0, q1, k1, v1 ... 
for each num_head part @@ -1294,33 +1362,34 @@ for each num_head part y = affine(out) ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | embed_dim | int | 0 | | -| 1 | num_heads | int | 1 | | -| 2 | weight_data_size| int | 0 | qdim = weight_data_size / embed_dim | -| 3 | kdim | int | embed_dim | | -| 4 | vdim | int | embed_dim | | -| 5 | attn_mask | int | 0 | | -| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | -| 18 | int8_scale_term | int | 0 | | - -| weight | type | shape | -| ------------- | ----- | --------------------- | -| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | -| q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | -| k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | -| v_bias_data | float | [embed_dim] | -| out_weight_data| float/fp16/int8 | [qdim * embed_dim] | -| out_bias_data | float | [qdim] | -| q_weight_data_int8_scales| float | [embed_dim] | -| k_weight_data_int8_scales| float | [embed_dim] | -| v_weight_data_int8_scales| float | [embed_dim] | -| out_weight_data_int8_scales| float | [1] | +| param id | name | type | default | description | +| -------- | ---------------- | ----- | --------------------------------- | ----------------------------------- | +| 0 | embed_dim | int | 0 | | +| 1 | num_heads | int | 1 | | +| 2 | weight_data_size | int | 0 | qdim = weight_data_size / embed_dim | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | +| 5 | attn_mask | int | 0 | | +| 6 | scale | float | 1.f / sqrt(embed_dim / num_heads) | | +| 18 | int8_scale_term | int | 0 | | + +| weight | type | shape | +| --------------------------- | --------------- | ------------------ | +| q_weight_data | float/fp16/int8 | [embed_dim * qdim] | +| q_bias_data | float | [embed_dim] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | +| k_bias_data | float | 
[embed_dim] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | +| v_bias_data | float | [embed_dim] | +| out_weight_data | float/fp16/int8 | [qdim * embed_dim] | +| out_bias_data | float | [qdim] | +| q_weight_data_int8_scales | float | [embed_dim] | +| k_weight_data_int8_scales | float | [embed_dim] | +| v_weight_data_int8_scales | float | [embed_dim] | +| out_weight_data_int8_scales | float | [1] | # MVN + ``` if normalize_variance == 1 && across_channels == 1 y = (x - mean) / (sqrt(var) + eps) of whole blob if normalize_variance == 1 && across_channels == 0 y = (x - mean) / (sqrt(var) + eps) of each channel @@ -1328,20 +1397,22 @@ if normalize_variance == 0 && across_channels == 1 y = x - mean of whole bl if normalize_variance == 0 && across_channels == 0 y = x - mean of each channel ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | normalize_variance| int | 0 | | -| 1 | across_channels| int | 0 | | -| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | +| param id | name | type | default | description | +| -------- | ------------------ | ----- | ------- | ------------------------- | +| 0 | normalize_variance | int | 0 | | +| 1 | across_channels | int | 0 | | +| 2 | eps | float | 0.0001f | x = x / (sqrt(var) + eps) | # Noop + ``` y = x ``` # Normalize + ``` if across_spatial == 1 && across_channel == 1 x2 = normalize(x) of whole blob if across_spatial == 1 && across_channel == 0 x2 = normalize(x) of each channel @@ -1349,79 +1420,85 @@ if across_spatial == 0 && across_channel == 1 x2 = normalize(x) of each pos y = x2 * scale ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | across_spatial| int | 0 | | -| 1 | channel_shared| int | 0 | | -| 2 | eps | float | 0.0001f | see eps mode | 
-| 3 | scale_data_size| int | 0 | | -| 4 | across_channel| int | 0 | | -| 9 | eps_mode | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ----- | ------- | ------------ | +| 0 | across_spatial | int | 0 | | +| 1 | channel_shared | int | 0 | | +| 2 | eps | float | 0.0001f | see eps mode | +| 3 | scale_data_size | int | 0 | | +| 4 | across_channel | int | 0 | | +| 9 | eps_mode | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | Eps Mode: -- 0 = caffe/mxnet x = x / sqrt(var + eps) -- 1 = pytorch x = x / max(sqrt(var), eps) -- 2 = tensorflow x = x / sqrt(max(var, eps)) + +- 0 = caffe/mxnet x = x / sqrt(var + eps) +- 1 = pytorch x = x / max(sqrt(var), eps) +- 2 = tensorflow x = x / sqrt(max(var, eps)) # Packing + ``` y = wrap_packing(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | out_elempack | int | 1 | | -| 1 | use_padding | int | 0 | | -| 2 | cast_type_from| int | 0 | | -| 3 | cast_type_to | int | 0 | | -| 4 | storage_type_from| int | 0 | | -| 5 | storage_type_to| int | 0 | | +| param id | name | type | default | description | +| -------- | ----------------- | ---- | ------- | ----------- | +| 0 | out_elempack | int | 1 | | +| 1 | use_padding | int | 0 | | +| 2 | cast_type_from | int | 0 | | +| 3 | cast_type_to | int | 0 | | +| 4 | storage_type_from | int | 0 | | +| 5 | storage_type_to | int | 0 | | # Padding + ``` y = pad(x, pads) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | top | int | 0 | | -| 1 | bottom | int | 0 | | -| 2 | left | int | 0 | | -| 3 | right | int | 0 | | -| 4 | type | int | 0 | | -| 5 | value | 
float | 0 | | -| 6 | per_channel_pad_data_size| int | 0 | | -| 7 | front | int | stride_w | | -| 8 | behind | int | pad_left | | +| param id | name | type | default | description | +| -------- | ------------------------- | ----- | -------- | ----------- | +| 0 | top | int | 0 | | +| 1 | bottom | int | 0 | | +| 2 | left | int | 0 | | +| 3 | right | int | 0 | | +| 4 | type | int | 0 | | +| 5 | value | float | 0 | | +| 6 | per_channel_pad_data_size | int | 0 | | +| 7 | front | int | stride_w | | +| 8 | behind | int | pad_left | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| per_channel_pad_data| float | [per_channel_pad_data_size] | +| weight | type | shape | +| -------------------- | ----- | --------------------------- | +| per_channel_pad_data | float | [per_channel_pad_data_size] | Padding type: + - 0 = CONSTANT - 1 = REPLICATE - 2 = REFLECT # Permute + ``` y = reorder(x) ``` -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | order_type | int | 0 | | +| param id | name | type | default | description | +| -------- | ---------- | ---- | ------- | ----------- | +| 0 | order_type | int | 0 | | Order Type: + - 0 = WH WHC WHDC - 1 = HW HWC HWDC - 2 = WCH WDHC @@ -1448,183 +1525,198 @@ Order Type: - 23 = CDHW # PixelShuffle + ``` if mode == 0 y = depth_to_space(x) where x channel order is sw-sh-outc if mode == 1 y = depth_to_space(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | upscale_factor| int | 1 | | -| 1 | mode | int | 0 | | +| param id | name | type | default | description | +| -------- | -------------- | ---- | ------- | ----------- | +| 0 | upscale_factor | int | 1 | | +| 1 | mode | int | 0 | | # Pooling + ``` x2 = pad(x, pads) x3 = pooling(x2, kernel, stride) ``` -| param id | name | type | 
default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 18 | out_h | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 18 | out_h | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling1D + ``` x2 = pad(x, pads) x3 = pooling1d(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 14 | pad_right | int | pad_left | | +| param id | name | 
type | default | description | +| -------- | ------------------------- | ---- | -------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 14 | pad_right | int | pad_left | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Pooling3D + ``` x2 = pad(x, pads) x3 = pooling3d(x2, kernel, stride) ``` -| param id | name | type | default | description | -| --------- | --------------| ---- | --------- | ----------------- | -| 0 | pooling_type | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | stride_w | int | 1 | | -| 3 | pad_left | int | 0 | | -| 4 | global_pooling| int | 0 | | -| 5 | pad_mode | int | 0 | | -| 6 | avgpool_count_include_pad| int | 0 | | -| 7 | adaptive_pooling| int | 0 | | -| 8 | out_w | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | stride_h | int | stride_w | | -| 13 | pad_top | int | pad_left | | -| 14 | pad_right | int | pad_left | | -| 15 | pad_bottom | int | pad_top | | -| 16 | pad_behind | int | pad_front | | -| 18 | out_h | int | out_w | | -| 21 | kernel_d | int | kernel_w | | -| 22 | stride_d | int | stride_w | | -| 23 | pad_front | int | pad_left | | -| 28 | out_d | int | out_w | | +| param id | name | type | default | description | +| -------- | ------------------------- | ---- | --------- | ----------- | +| 0 | pooling_type | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | stride_w | int | 1 | | +| 3 | pad_left | int | 0 | | +| 4 | global_pooling | int | 0 | | +| 5 | pad_mode | int | 0 | | +| 6 | avgpool_count_include_pad | int | 0 | | +| 7 | adaptive_pooling | int | 0 | | +| 8 | out_w | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 
| stride_h | int | stride_w | | +| 13 | pad_top | int | pad_left | | +| 14 | pad_right | int | pad_left | | +| 15 | pad_bottom | int | pad_top | | +| 16 | pad_behind | int | pad_front | | +| 18 | out_h | int | out_w | | +| 21 | kernel_d | int | kernel_w | | +| 22 | stride_d | int | stride_w | | +| 23 | pad_front | int | pad_left | | +| 28 | out_d | int | out_w | | Pooling type: + - 0 = MAX - 1 = AVG Pad mode: + - 0 = full padding - 1 = valid padding - 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER - 3 = onnx padding=SAME_LOWER # Power + ``` y = pow((shift + x * scale), power) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | power | float | 1.f | | -| 1 | scale | float | 1.f | | -| 2 | shift | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | power | float | 1.f | | +| 1 | scale | float | 1.f | | +| 2 | shift | float | 0.f | | # PReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_slope | int | 0 | | +| param id | name | type | default | description | +| -------- | --------- | ---- | ------- | ----------- | +| 0 | num_slope | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| slope_data | float | [num_slope] | +| weight | type | shape | +| ---------- | ----- | ----------- | +| slope_data | float | [num_slope] | # Quantize + ``` y = float2int8(x * scale) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 1 | | +| param id | name | type | default | 
description | +| -------- | --------------- | ---- | ------- | ----------- | +| 0 | scale_data_size | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | # Reduction + ``` y = reduce_op(x * coeff) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | operation | int | 0 | | -| 1 | reduce_all | int | 1 | | -| 2 | coeff | float | 1.f | | -| 3 | axes | array | [ ] | | -| 4 | keepdims | int | 0 | | -| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ---------- | ----- | ------- | ----------------------------- | +| 0 | operation | int | 0 | | +| 1 | reduce_all | int | 1 | | +| 2 | coeff | float | 1.f | | +| 3 | axes | array | [ ] | | +| 4 | keepdims | int | 0 | | +| 5 | fixbug0 | int | 0 | hack for bug fix, should be 1 | Operation type: + - 0 = SUM - 1 = ASUM - 2 = SUMSQ @@ -1638,96 +1730,103 @@ Operation type: - 10 = LogSumExp # ReLU + ``` if x < 0 y = x * slope else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slope | float | 0.f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | slope | float | 0.f | | # Reorg + ``` if mode == 0 y = space_to_depth(x) where x channel order is sw-sh-outc if mode == 1 y = space_to_depth(x) where x channel order is outc-sw-sh ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | stride | int | 1 | | -| 1 | mode | int | 0 
| | +| param id | name | type | default | description | +| -------- | ------ | ---- | ------- | ----------- | +| 0 | stride | int | 1 | | +| 1 | mode | int | 0 | | # Requantize + ``` x2 = x * scale_in + bias x3 = activation(x2) y = float2int8(x3 * scale_out) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_in_data_size| int | 1 | | -| 1 | scale_out_data_size| int | 1 | | -| 2 | bias_data_size| int | 0 | | -| 3 | activation_type| int | 0 | | -| 4 | activation_params| int | [ ] | | +| param id | name | type | default | description | +| -------- | ------------------- | ---- | ------- | ----------- | +| 0 | scale_in_data_size | int | 1 | | +| 1 | scale_out_data_size | int | 1 | | +| 2 | bias_data_size | int | 0 | | +| 3 | activation_type | int | 0 | | +| 4 | activation_params | int | [ ] | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_in_data | float | [scale_in_data_size] | -| scale_out_data| float | [scale_out_data_size] | -| bias_data | float | [bias_data_size] | +| weight | type | shape | +| -------------- | ----- | --------------------- | +| scale_in_data | float | [scale_in_data_size] | +| scale_out_data | float | [scale_out_data_size] | +| bias_data | float | [bias_data_size] | # Reshape + ``` if permute == 1 y = hwc2chw(reshape(chw2hwc(x))) else y = reshape(x) ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | w | int | -233 | | -| 1 | h | int | -233 | | -| 11 | d | int | -233 | | -| 2 | c | int | -233 | | -| 3 | permute | int | 0 | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | w | int | -233 | | +| 1 | h | int | -233 | | +| 11 | d | int | -233 | | +| 2 | c | int | -233 | | +| 3 | permute | int | 0 | | Reshape 
flag: + - 0 = copy from bottom - -1 = remaining - -233 = drop this dim(default) # RMSNorm + ``` split x along outmost axis into part x0, x1 ... root mean square normalize for each part x0, x1 ... y = x * gamma by elementwise ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | affine_size | int | 0 | | -| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | -| 2 | affine | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | ----- | ------- | ----------------------- | +| 0 | affine_size | int | 0 | | +| 1 | eps | float | 0.001f | x = x / sqrt(var + eps) | +| 2 | affine | int | 1 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| gamma_data | float | [affine_size] | +| weight | type | shape | +| ---------- | ----- | ------------- | +| gamma_data | float | [affine_size] | # RNN + Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`. 
``` @@ -1735,127 +1834,137 @@ y = rnn(x) y0, hidden y1 = rnn(x0, hidden x1) ``` -* one_blob_only if bidirectional +- one_blob_only if bidirectional -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | -| 1 | weight_data_size| int | 0 | total size of weight matrix | -| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| param id | name | type | default | description | +| -------- | ---------------- | ---- | ------- | ------------------------------------- | +| 0 | num_output | int | 0 | hidden size of output | +| 1 | weight_data_size | int | 0 | total size of weight matrix | +| 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] | +| weight | type | shape | +| -------------- | --------------- | ---------------------------------------- | +| weight_xc_data | float/fp16/int8 | [input_size, num_output, num_directions] | +| bias_c_data | float/fp16/int8 | [num_output, 1, num_directions] | +| weight_hc_data | float/fp16/int8 | [num_output, num_output, num_directions] | Direction flag: + - 0 = forward only - 1 = reverse only - 2 = bidirectional # Scale + ``` if scale_data_size == -233 y = x0 * x1 else y = x * scale + bias ``` -* one_blob_only if scale_data_size != -233 -* support_inplace +- one_blob_only if scale_data_size != -233 +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | scale_data_size| int | 0 | | -| 1 | bias_term | int | 0 | | +| param id | name | type | default | description | +| -------- | --------------- | ---- 
| ------- | ----------- | +| 0 | scale_data_size | int | 0 | | +| 1 | bias_term | int | 0 | | -| weight | type | shape | -| ------------- | ----- | --------------------- | -| scale_data | float | [scale_data_size] | -| bias_data | float | [scale_data_size] | +| weight | type | shape | +| ---------- | ----- | ----------------- | +| scale_data | float | [scale_data_size] | +| bias_data | float | [scale_data_size] | # SELU + ``` if x < 0 y = (exp(x) - 1.f) * alpha * lambda else y = x * lambda ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | alpha | float | 1.67326324f| | -| 1 | lambda | float | 1.050700987f| | +| param id | name | type | default | description | +| -------- | ------ | ----- | ------------ | ----------- | +| 0 | alpha | float | 1.67326324f | | +| 1 | lambda | float | 1.050700987f | | # Shrink + ``` if x < -lambd y = x + bias if x > lambd y = x - bias else y = x ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | bias | float | 0.0f | | -| 1 | lambd | float | 0.5f | | +| param id | name | type | default | description | +| -------- | ----- | ----- | ------- | ----------- | +| 0 | bias | float | 0.0f | | +| 1 | lambd | float | 0.5f | | # ShuffleChannel + ``` if reverse == 0 y = shufflechannel(x) by group if reverse == 1 y = shufflechannel(x) by channel / group ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ---- | --------- | ----------------- | -| 0 | group | int | 1 | | -| 1 | reverse | int | 0 | | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------- | +| 0 | group | int | 1 | | +| 1 | reverse | int | 0 | | # Sigmoid + ``` y = 
1 / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Slice + ``` split x along axis into slices, each part slice size is based on slices array ``` -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | slices | array | [ ] | | -| 1 | axis | int | 0 | | -| 2 | indices | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | slices | array | [ ] | | +| 1 | axis | int | 0 | | +| 2 | indices | array | [ ] | | # Softmax + ``` softmax(x, axis) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ----------------------------- | +| 0 | axis | int | 0 | | +| 1 | fixbug0 | int | 0 | hack for bug fix, should be 1 | # Softplus + ``` y = log(exp(x) + 1) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Spectrogram + ``` x1 = pad(x) if center y = stft(x1) @@ -1866,68 +1975,74 @@ if power == 1 return magnitude if power == 2 return square of magnitude ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | n_fft | int | 0 | | -| 1 | power | int | 0 | | -| 2 | hoplen | int | n_fft / 4 | | -| 3 | winlen | int | n_fft | | -| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | -| 5 | center | int | 1 | | -| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | -| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | -| 8 | onesided | int | 1 | | +| param id | name | type | default | description | +| -------- | ----------- | 
---- | --------- | -------------------------------- | +| 0 | n_fft | int | 0 | | +| 1 | power | int | 0 | | +| 2 | hoplen | int | n_fft / 4 | | +| 3 | winlen | int | n_fft | | +| 4 | window_type | int | 0 | 0=ones 1=hann 2=hamming | +| 5 | center | int | 1 | | +| 6 | pad_type | int | 2 | 0=CONSTANT 1=REPLICATE 2=REFLECT | +| 7 | normalized | int | 0 | 0=no 1=n_fft 2=window-l2-energy | +| 8 | onesided | int | 1 | | # Split + ``` y0, y1 ... = x ``` # Swish + ``` y = x / (1 + exp(-x)) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # TanH + ``` y = tanh(x) ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace # Threshold + ``` if x > threshold y = 1 else y = 0 ``` -* one_blob_only -* support_inplace +- one_blob_only +- support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | threshold | float | 0.f | | +| param id | name | type | default | description | +| -------- | --------- | ----- | ------- | ----------- | +| 0 | threshold | float | 0.f | | # Tile + ``` y = repeat tiles along axis for x ``` -* one_blob_only +- one_blob_only -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | axis | int | 0 | | -| 1 | tiles | int | 1 | | -| 2 | repeats | array | [ ] | | +| param id | name | type | default | description | +| -------- | ------- | ----- | ------- | ----------- | +| 0 | axis | int | 0 | | +| 1 | tiles | int | 1 | | +| 2 | repeats | array | [ ] | | # UnaryOp + ``` y = unaryop(x) ``` @@ -1935,11 +2050,12 @@ y = unaryop(x) - one_blob_only - support_inplace -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | op_type | int | 0 | Operation type as follows | +| param id | name | type | default | description | +| -------- | ------- | ---- | ------- | ------------------------- | +| 0 | 
op_type | int | 0 | Operation type as follows | Operation type: + - 0 = ABS - 1 = NEG - 2 = FLOOR @@ -1962,22 +2078,23 @@ Operation type: - 19 = TRUNC # Unfold + ``` y = unfold(x) ``` -* one_blob_only - -| param id | name | type | default | description | -| --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | | -| 1 | kernel_w | int | 0 | | -| 2 | dilation_w | int | 1 | | -| 3 | stride_w | int | 1 | | -| 4 | pad_left | int | 0 | | -| 11 | kernel_h | int | kernel_w | | -| 12 | dilation_h | int | dilation_w | | -| 13 | stride_h | int | stride_w | | -| 14 | pad_top | int | pad_left | | -| 15 | pad_right | int | pad_left | | -| 16 | pad_bottom | int | pad_top | | +- one_blob_only + +| param id | name | type | default | description | +| -------- | ---------- | ---- | ---------- | ----------- | +| 0 | num_output | int | 0 | | +| 1 | kernel_w | int | 0 | | +| 2 | dilation_w | int | 1 | | +| 3 | stride_w | int | 1 | | +| 4 | pad_left | int | 0 | | +| 11 | kernel_h | int | kernel_w | | +| 12 | dilation_h | int | dilation_w | | +| 13 | stride_h | int | stride_w | | +| 14 | pad_top | int | pad_left | | +| 15 | pad_right | int | pad_left | | +| 16 | pad_bottom | int | pad_top | | diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index b1ac6f5c0245..cbc31e217ef8 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -575,6 +575,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_cumsum.cpp pass_ncnn/torch_diag.cpp pass_ncnn/torch_flatten.cpp + pass_ncnn/torch_index_select.cpp pass_ncnn/torch_istft.cpp pass_ncnn/torch_logsumexp.cpp pass_ncnn/torch_matmul.cpp diff --git a/tools/pnnx/src/pass_ncnn/torch_index_select.cpp b/tools/pnnx/src/pass_ncnn/torch_index_select.cpp new file mode 100644 index 000000000000..8cb0cef548b6 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_index_select.cpp @@ -0,0 +1,58 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class torch_index_select : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 4
+pnnx.Input input_0 0 1 input
+pnnx.Input input_1 0 1 index
+torch.index_select op_0 2 1 input index out dim=%dim
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "IndexSelect";
+    }
+
+    const char* name_str() const
+    {
+        return "index_select";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        int dim = captured_params.at("dim").i;
+
+        // map the captured torch.index_select "dim" onto ncnn IndexSelect param id 0
+        op->params["0"] = dim;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_index_select, 60)
+
+} // namespace ncnn
+
+} // namespace pnnx

From 4dbfd7193ec3fdcce1017225517dea4ee9c9b702 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com>
Date: Thu, 16 Jan 2025 21:10:51 +0800
Subject: [PATCH 2/3] check c++03 and ctest

---
 .github/workflows/linux-x64-cpu-gcc.yml       | 218 +++++++-------
 src/CMakeLists.txt                            |   1 +
 src/layer/indexselect.cpp                     | 270 ++++++++++++++++++
 src/layer/indexselect.h                       |  37 +++
 tests/CMakeLists.txt                          |   1 +
 tests/test_indexselect.cpp                    | 113 ++++++++
 tools/pnnx/tests/ncnn/CMakeLists.txt          |   1 +
 .../tests/ncnn/test_torch_index_select.py     | 149 ++++++++++
 8 
files changed, 681 insertions(+), 109 deletions(-) create mode 100644 src/layer/indexselect.cpp create mode 100644 src/layer/indexselect.h create mode 100644 tests/test_indexselect.cpp create mode 100644 tools/pnnx/tests/ncnn/test_torch_index_select.py diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml index ab2185be3e74..31abbe47c251 100644 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -1,33 +1,33 @@ name: linux-x64-cpu-gcc on: push: - branches: [master] + # branches: [master] paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' + - ".github/workflows/linux-x64-cpu-gcc.yml" + - "toolchains/host-c.gcc.toolchain.cmake" + - "CMakeLists.txt" + - "cmake/**" + - "src/*" + - "src/layer/*" + - "src/layer/x86/**" + - "tests/**" + - "tools/**" + - "!tools/pnnx/**" + - "examples/**" pull_request: branches: [master] paths: - - '.github/workflows/linux-x64-cpu-gcc.yml' - - 'toolchains/host-c.gcc.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/x86/**' - - 'tests/**' - - 'tools/**' - - '!tools/pnnx/**' - - 'examples/**' + - ".github/workflows/linux-x64-cpu-gcc.yml" + - "toolchains/host-c.gcc.toolchain.cmake" + - "CMakeLists.txt" + - "cmake/**" + - "src/*" + - "src/layer/*" + - "src/layer/x86/**" + - "tests/**" + - "tools/**" + - "!tools/pnnx/**" + - "examples/**" concurrency: group: linux-x64-cpu-gcc-${{ github.ref }} cancel-in-progress: true @@ -38,97 +38,97 @@ jobs: linux-gcc: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: update - run: sudo apt-get update - - name: protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - 
cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - - name: build-avx2 - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j $(nproc) - - name: build-avx1-2 - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-avx2 + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j $(nproc) + - name: build-avx + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j $(nproc) + - name: build-avx1-2 + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) linux-gcc-cpp03-nostdio-nostring-simplestl: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: build-nostdio - run: | - mkdir build-nostdio && cd build-nostdio - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-nostdio - run: cd build-nostdio && ctest --output-on-failure -j $(nproc) - - name: build-nostdio-nostring - run: | - mkdir build-nostdio-nostring && cd build-nostdio-nostring - cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: build-simplestl - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j $(nproc) - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - - name: build-simplestl-simpleomp - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) linux-gcc-avx512: runs-on: [self-hosted, linux, t4] steps: - - uses: actions/checkout@v4 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - mkdir build && cd build - cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + - uses: actions/checkout@v4 + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + mkdir build && cd build + cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c97235d97a00..33f9db2a630f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -169,6 +169,7 @@ ncnn_add_layer(Shrink) ncnn_add_layer(RMSNorm) ncnn_add_layer(Spectrogram) ncnn_add_layer(InverseSpectrogram) +ncnn_add_layer(IndexSelect) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) diff --git a/src/layer/indexselect.cpp b/src/layer/indexselect.cpp new file mode 100644 index 000000000000..45dfc63b791b --- /dev/null +++ b/src/layer/indexselect.cpp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "indexselect.h" + +namespace ncnn { +IndexSelect::IndexSelect() +{ + one_blob_only = false; // 是否单一输入 + support_inplace = false; // 是否支持原地运算 +} + +int IndexSelect::load_param(const ParamDict& pd) +{ + dim = pd.get(0, -1); // dim = [-dim~dim-1] + return 0; +} + +int IndexSelect::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& index_blob = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; // 仅1个输出 + int dims = bottom_blob.dims; + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + + int index_len = index_blob.w; // 索引数据 + + int axis = dim < 0 ? dim + dims : dim; + // 检查k值是否有效 + if (index_len < 1 || axis >= dims) + { + return -1; + } + + if (dims == 1) + { + // 创建输出blob + top_blob.create(index_len, elemsize, opt.blob_allocator); + const float* ptr = bottom_blob; + float* outptr = top_blob; + const int* index_ptr = index_blob; + for (int i = 0; i < index_len; i++) + { + outptr[i] = ptr[index_ptr[i]]; + } + } + else if (dims == 2) + { + if (axis == 0) + { + top_blob.create(w, index_len, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int i = 0; i < index_len; i++) + { + int index = index_ptr[i]; + const float* ptr_row = bottom_blob.row(index); + float* outptr_row = top_blob.row(i); + memcpy(outptr_row, ptr_row, w * sizeof(float)); + } + } + else if (axis == 1) + { + top_blob.create(index_len, h, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + for (int i = 0; i < h; i++) + { + const float* ptr_row = bottom_blob.row(i); + float* outptr_row = top_blob.row(i); + + // 对每一行,根据索引选择对应列 + for (int j = 0; j < index_len; j++) + { + int index = index_ptr[j]; + outptr_row[j] = ptr_row[index]; + } + } + } + } + else if (dims == 3) + { + if (axis == 0) // channels维度 + { + top_blob.create(w, h, index_len, elemsize, 
opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < index_len; q++) + { + int index = index_ptr[q]; + const Mat bottom_channel = bottom_blob.channel(index); + Mat top_channel = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_channel.row(i); + float* outptr = top_channel.row(i); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + else if (axis == 1) // h维度 + { + top_blob.create(w, index_len, channels, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < channels; q++) + { + const Mat bottom_channel = bottom_blob.channel(q); + Mat top_channel = top_blob.channel(q); + + for (int i = 0; i < index_len; i++) + { + int index = index_ptr[i]; + const float* ptr = bottom_channel.row(index); + float* outptr = top_channel.row(i); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + else if (axis == 2) // w维度 + { + top_blob.create(index_len, h, channels, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < channels; q++) + { + const Mat bottom_channel = bottom_blob.channel(q); + Mat top_channel = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_channel.row(i); + float* outptr = top_channel.row(i); + for (int j = 0; j < index_len; j++) + { + int index = index_ptr[j]; + outptr[j] = ptr[index]; + } + } + } + } + } + + else if (dims == 4) + { + if (axis == 0) // channels维度 + { + top_blob.create(w, h, d, index_len, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < index_len; q++) + { + int index = index_ptr[q]; + const Mat bottom_c = bottom_blob.channel(index); + Mat top_c = top_blob.channel(q); + + for (int z = 0; z < d; z++) + { + const Mat bottom_d = bottom_c.channel(z); + Mat top_d = top_c.channel(z); + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_d.row(i); + float* outptr = top_d.row(i); + memcpy(outptr, ptr, w * sizeof(float)); + } + } 
+ } + } + else if (axis == 1) // d维度 + { + top_blob.create(w, h, index_len, channels, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < channels; q++) + { + const Mat bottom_c = bottom_blob.channel(q); + Mat top_c = top_blob.channel(q); + + for (int z = 0; z < index_len; z++) + { + int index = index_ptr[z]; + const Mat bottom_d = bottom_c.channel(index); + Mat top_d = top_c.channel(z); + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_d.row(i); + float* outptr = top_d.row(i); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else if (axis == 2) // h维度 + { + top_blob.create(w, index_len, d, channels, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < channels; q++) + { + const Mat bottom_c = bottom_blob.channel(q); + Mat top_c = top_blob.channel(q); + + for (int z = 0; z < d; z++) + { + const Mat bottom_d = bottom_c.channel(z); + Mat top_d = top_c.channel(z); + for (int i = 0; i < index_len; i++) + { + int index = index_ptr[i]; + const float* ptr = bottom_d.row(index); + float* outptr = top_d.row(i); + memcpy(outptr, ptr, w * sizeof(float)); + } + } + } + } + else if (axis == 3) // w维度 + { + top_blob.create(index_len, h, d, channels, elemsize, opt.blob_allocator); + const int* index_ptr = index_blob; + + for (int q = 0; q < channels; q++) + { + const Mat bottom_c = bottom_blob.channel(q); + Mat top_c = top_blob.channel(q); + + for (int z = 0; z < d; z++) + { + const Mat bottom_d = bottom_c.channel(z); + Mat top_d = top_c.channel(z); + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_d.row(i); + float* outptr = top_d.row(i); + for (int j = 0; j < index_len; j++) + { + int index = index_ptr[j]; + outptr[j] = ptr[index]; + } + } + } + } + } + } + else + { + return -1; + } + + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/indexselect.h b/src/layer/indexselect.h new file mode 100644 index 000000000000..1a0da5712082 
--- /dev/null +++ b/src/layer/indexselect.h @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INDEXSELECT_H +#define LAYER_INDEXSELECT_H + +#include "layer.h" + +namespace ncnn { + +class IndexSelect : public Layer +{ +public: + IndexSelect(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + int dim; +}; + +} // namespace ncnn + +#endif // LAYER_INDEXSELECT_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f55859e736ea..4f44e8c9825b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -115,6 +115,7 @@ ncnn_add_layer_test(GRU) ncnn_add_layer_test(HardSigmoid) ncnn_add_layer_test(HardSwish) ncnn_add_layer_test(InnerProduct) +ncnn_add_layer_test(IndexSelect) ncnn_add_layer_test(InstanceNorm) ncnn_add_layer_test(Interp) ncnn_add_layer_test(InverseSpectrogram) diff --git a/tests/test_indexselect.cpp b/tests/test_indexselect.cpp new file mode 100644 index 000000000000..9577df813469 --- /dev/null +++ b/tests/test_indexselect.cpp @@ -0,0 +1,113 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static ncnn::Mat IntArrayMat(int a0) +{ + ncnn::Mat m(1); + int* p = m; + p[0] = a0; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1) +{ + ncnn::Mat m(2); + int* p = m; + p[0] = a0; + p[1] = a1; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2) +{ + ncnn::Mat m(3); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + +static int test_index_select(const ncnn::Mat& a, const ncnn::Mat& index, int axis) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + + std::vector weights(0); + + std::vector a0(2); + a0[0] = a; + a0[1] = index; + + int ret = test_layer("IndexSelect", pd, weights, a0); + if (ret != 0) + { + fprintf(stderr, "test_index_select failed a.dims=%d a=(%d %d %d %d) index.w=%d axis=%d\n", a.dims, a.w, a.h, a.d, a.c, index.w, axis); + } + + return ret; +} + +static int test_index_select_0() +{ + return 0 + || test_index_select(RandomMat(3, 4, 5, 6), IntArrayMat(1, 0), 0) + || test_index_select(RandomMat(4, 4, 5, 6), IntArrayMat(1, 0), 1) + || test_index_select(RandomMat(3, 4, 7, 6), IntArrayMat(1, 0), 2) + || test_index_select(RandomMat(7, 2, 5, 6), IntArrayMat(1, 0, 3), 3); +} + +static int test_index_select_1() +{ + return 0 + || test_index_select(RandomMat(2, 3, 5), 
IntArrayMat(1, 0), -3) + || test_index_select(RandomMat(4, 3, 5), IntArrayMat(0, 1), -2) + || test_index_select(RandomMat(6, 4, 5), IntArrayMat(2, 0), -1); +} + +static int test_index_select_2() +{ + return 0 + || test_index_select(RandomMat(8, 6), IntArrayMat(1, 4, 3), 0) + || test_index_select(RandomMat(8, 7), IntArrayMat(3, 5), 1); +} + +static int test_index_select_3() +{ + return 0 + || test_index_select(RandomMat(18), IntArrayMat(1, 7, 9, 15), -1); +} + +int main() +{ + SRAND(7767517); + return 0 + || test_index_select_0() + || test_index_select_1() + || test_index_select_2() + || test_index_select_3(); +} \ No newline at end of file diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 42c3bed32e05..614f17e80d86 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -203,6 +203,7 @@ pnnx_ncnn_add_test(torch_square) pnnx_ncnn_add_test(torch_tan) pnnx_ncnn_add_test(torch_tanh) pnnx_ncnn_add_test(torch_trunc) +pnnx_ncnn_add_test(torch_index_select) pnnx_ncnn_add_test(convnext_tiny) pnnx_ncnn_add_test(mobilenet_v2) diff --git a/tools/pnnx/tests/ncnn/test_torch_index_select.py b/tools/pnnx/tests/ncnn/test_torch_index_select.py new file mode 100644 index 000000000000..902159cc856b --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_index_select.py @@ -0,0 +1,149 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# 由于ncnn的MemoryData层不支持int64,所以先浮点存档,再转回int64 +# class Model(nn.Module): +# def __init__(self): +# super(Model, self).__init__() + +# def forward(self, x, y, z, d): +# # 1D +# x0 = torch.index_select(x, 0, torch.tensor([7, 9, 11])) +# # 2D +# y0 = torch.index_select(y, 0, torch.tensor([1, 0])) +# y1 = torch.index_select(y, 1, torch.tensor([1, 3, 2])) +# # 3D +# z0 = torch.index_select(z, -3, torch.tensor([1, 0])) +# z1 = torch.index_select(z, -2, torch.tensor([2, 0])) +# z2 = torch.index_select(z, -1, torch.tensor([1, 2])) +# # 4D +# d0 = torch.index_select(d, 0, torch.tensor([1, 0])) +# d1 = torch.index_select(d, 1, torch.tensor([1, 0])) +# d2 = torch.index_select(d, 2, torch.tensor([1, 0])) +# d3 = torch.index_select(d, 3, torch.tensor([1, 0])) + +# return x0, y0, y1, z0, z1, z2, d0, d1, d2, d3 + + +# 成功 +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + # 注册所有需要的索引缓冲区 + self.register_buffer("idx_x", torch.tensor([7.0, 9.0, 11.0])) + self.register_buffer("idx_y0", torch.tensor([1.0, 0.0])) + self.register_buffer("idx_y1", torch.tensor([1.0, 3.0, 2.0])) + self.register_buffer("idx_z0", torch.tensor([1.0, 0.0])) + self.register_buffer("idx_z1", torch.tensor([2.0, 0.0])) + self.register_buffer("idx_z2", torch.tensor([1.0, 2.0, 0.0])) + self.register_buffer("idx_d0", torch.tensor([1.0, 0.0, 3.0])) + self.register_buffer("idx_d1", torch.tensor([0.0, 1.0])) + self.register_buffer("idx_d2", torch.tensor([4.0, 3.0, 0.0])) + self.register_buffer("idx_d3", torch.tensor([3.0, 6.0, 2.0])) + + def float_to_int(self, idx_float): + mask = torch.ones_like(idx_float) + return (torch.max(idx_float * mask, mask * 0)).int() + + def forward(self, x, y, z, d): + # 使用辅助函数进行转换 + x0 = torch.index_select(x, 0, self.float_to_int(self.idx_x)) + y0 = torch.index_select(y, 0, self.float_to_int(self.idx_y0)) + y1 = torch.index_select(y, 1, self.float_to_int(self.idx_y1)) + z0 = 
torch.index_select(z, -3, self.float_to_int(self.idx_z0)) + z1 = torch.index_select(z, -2, self.float_to_int(self.idx_z1)) + z2 = torch.index_select(z, -1, self.float_to_int(self.idx_z2)) + d0 = torch.index_select(d, 0, self.float_to_int(self.idx_d0)) + d1 = torch.index_select(d, 1, self.float_to_int(self.idx_d1)) + d2 = torch.index_select(d, 2, self.float_to_int(self.idx_d2)) + d3 = torch.index_select(d, 3, self.float_to_int(self.idx_d3)) + + return x0, y0, y1, z0, z1, z2, d0, d1, d2, d3 + + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(36) # 1D + y = torch.rand(5, 7) # 2D + z = torch.rand(3, 5, 8) # 3D + d = torch.rand(4, 3, 6, 7) # 4D + + a = net(x, y, z, d) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, d)) + mod.save("test_torch_index_select.pt") + + # torchscript to pnnx + import os + + os.system( + "../../src/pnnx test_torch_index_select.pt inputshape=[36],[5,7],[3,5,8],[4,3,6,7]" + ) + + # pnnx inference + import test_torch_index_select_ncnn + + b = test_torch_index_select_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 8266f4b7f7e43195fe5747a330196c61d6779b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Thu, 16 Jan 2025 21:27:44 +0800 Subject: [PATCH 3/3] clean code --- .github/workflows/linux-x64-cpu-gcc.yml | 218 ++++++++++++------------ 1 file changed, 109 insertions(+), 109 deletions(-) diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml index 31abbe47c251..ab2185be3e74 100644 --- a/.github/workflows/linux-x64-cpu-gcc.yml +++ b/.github/workflows/linux-x64-cpu-gcc.yml @@ -1,33 +1,33 @@ name: linux-x64-cpu-gcc on: push: - # branches: [master] + branches: [master] paths: - - ".github/workflows/linux-x64-cpu-gcc.yml" - - 
"toolchains/host-c.gcc.toolchain.cmake" - - "CMakeLists.txt" - - "cmake/**" - - "src/*" - - "src/layer/*" - - "src/layer/x86/**" - - "tests/**" - - "tools/**" - - "!tools/pnnx/**" - - "examples/**" + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' pull_request: branches: [master] paths: - - ".github/workflows/linux-x64-cpu-gcc.yml" - - "toolchains/host-c.gcc.toolchain.cmake" - - "CMakeLists.txt" - - "cmake/**" - - "src/*" - - "src/layer/*" - - "src/layer/x86/**" - - "tests/**" - - "tools/**" - - "!tools/pnnx/**" - - "examples/**" + - '.github/workflows/linux-x64-cpu-gcc.yml' + - 'toolchains/host-c.gcc.toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' concurrency: group: linux-x64-cpu-gcc-${{ github.ref }} cancel-in-progress: true @@ -38,97 +38,97 @@ jobs: linux-gcc: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: update - run: sudo apt-get update - - name: protobuf - run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j $(nproc) - - name: build-shared - run: | - mkdir build-shared && cd build-shared - cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - - name: build-avx2 - run: | - mkdir build-avx2 && cd build-avx2 - cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j $(nproc) - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j $(nproc) - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j $(nproc) - - name: build-avx1-2 - run: | - mkdir build-avx1-2 && cd build-avx1-2 - cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-avx1-2 - run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) - - name: build-noint8 - run: | - mkdir build-noint8 && cd build-noint8 - cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j $(nproc) - - name: test-noint8 - run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: build-sse2 + run: | + mkdir build-sse2 && cd build-sse2 + cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-sse2 + run: cd build-sse2 && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-avx2 + run: | + mkdir build-avx2 && cd build-avx2 + cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx2 + run: cd build-avx2 && ctest --output-on-failure -j $(nproc) + - name: build-avx + run: | + mkdir build-avx && cd build-avx + cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-avx + run: cd build-avx && ctest --output-on-failure -j $(nproc) + - name: build-avx1-2 + run: | + mkdir build-avx1-2 && cd build-avx1-2 + cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test-avx1-2 + run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) linux-gcc-cpp03-nostdio-nostring-simplestl: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 - - name: build-nostdio - run: | - mkdir build-nostdio && cd build-nostdio - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-nostdio - run: cd build-nostdio && ctest --output-on-failure -j $(nproc) - - name: build-nostdio-nostring - run: | - mkdir build-nostdio-nostring && cd build-nostdio-nostring - cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: build-simplestl - run: | - mkdir build-simplestl && cd build-simplestl - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j $(nproc) - - name: test-simplestl - run: cd build-simplestl && ctest --output-on-failure -j $(nproc) - - name: build-simplestl-simpleomp - run: | - mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . 
-j $(nproc) - - name: test-simplestl-simpleomp - run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) + - uses: actions/checkout@v4 + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) linux-gcc-avx512: runs-on: [self-hosted, linux, t4] steps: - - uses: actions/checkout@v4 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - mkdir build && cd build - cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + - uses: actions/checkout@v4 + - name: build + env: + CC: gcc + CXX: g++ + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + mkdir build && cd build + cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j 4 + - name: test + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: cd build && ctest --output-on-failure -j 4