Skip to content

Commit 8b6d082

Browse files
authored
Merge pull request #91 from FengZiYjun/master
Merge Preprocessor into DataSet.
2 parents 281b567 + 81790d7 commit 8b6d082

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+2249
-67893
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ python:
55
install:
66
- pip install --quiet -r requirements.txt
77
- pip install pytest pytest-cov
8-
- pip install -U scikit-learn
98
# command to run tests
109
script:
1110
- pytest --cov=./

README.md

Lines changed: 30 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -30,77 +30,36 @@ Run the following commands to install fastNLP package.
3030
pip install fastNLP
3131
```
3232

33-
### Cloning From GitHub
34-
35-
If you just want to use fastNLP, use:
36-
```shell
37-
git clone https://github.com/fastnlp/fastNLP
38-
cd fastNLP
39-
```
40-
41-
### PyTorch Installation
42-
43-
Visit the [PyTorch official website] for installation instructions based on your system. In general, you could use:
44-
```shell
45-
# using conda
46-
conda install pytorch torchvision -c pytorch
47-
# or using pip
48-
pip3 install torch torchvision
49-
```
50-
51-
### TensorboardX Installation
52-
53-
```shell
54-
pip3 install tensorboardX
55-
```
5633

5734
## Project Structure
5835

59-
```
60-
FastNLP
61-
├── docs
62-
├── fastNLP
63-
│   ├── core
64-
│   │   ├── action.py
65-
│   │   ├── __init__.py
66-
│   │   ├── loss.py
67-
│   │   ├── metrics.py
68-
│   │   ├── optimizer.py
69-
│   │   ├── predictor.py
70-
│   │   ├── preprocess.py
71-
│   │   ├── README.md
72-
│   │   ├── tester.py
73-
│   │   └── trainer.py
74-
│   ├── fastnlp.py
75-
│   ├── __init__.py
76-
│   ├── loader
77-
│   │   ├── base_loader.py
78-
│   │   ├── config_loader.py
79-
│   │   ├── dataset_loader.py
80-
│   │   ├── embed_loader.py
81-
│   │   ├── __init__.py
82-
│   │   └── model_loader.py
83-
│   ├── models
84-
│   ├── modules
85-
│   │   ├── aggregation
86-
│   │   ├── decoder
87-
│   │   ├── encoder
88-
│   │   ├── __init__.py
89-
│   │   ├── interaction
90-
│   │   ├── other_modules.py
91-
│   │   └── utils.py
92-
│   └── saver
93-
├── LICENSE
94-
├── README.md
95-
├── reproduction
96-
├── requirements.txt
97-
├── setup.py
98-
└── test
99-
├── core
100-
├── data_for_tests
101-
├── __init__.py
102-
├── loader
103-
├── modules
104-
└── readme_example.py
105-
106-
```
36+
<table>
37+
<tr>
38+
<td><b> fastNLP </b></td>
39+
<td> an open-source NLP library </td>
40+
</tr>
41+
<tr>
42+
<td><b> fastNLP.core </b></td>
43+
<td> trainer, tester, predictor </td>
44+
</tr>
45+
<tr>
46+
<td><b> fastNLP.loader </b></td>
47+
<td> all kinds of loaders/readers </td>
48+
</tr>
49+
<tr>
50+
<td><b> fastNLP.models </b></td>
51+
<td> a collection of NLP models </td>
52+
</tr>
53+
<tr>
54+
<td><b> fastNLP.modules </b></td>
55+
<td> a collection of PyTorch sub-models/components/wheels </td>
56+
</tr>
57+
<tr>
58+
<td><b> fastNLP.saver </b></td>
59+
<td> all kinds of savers/writers </td>
60+
</tr>
61+
<tr>
62+
<td><b> fastNLP.fastnlp </b></td>
63+
<td> a high-level interface for prediction </td>
64+
</tr>
65+
</table>

docs/source/user/quickstart.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ pre-processing data, constructing model and training model.
1818
from fastNLP.modules import aggregation
1919
from fastNLP.modules import decoder
2020
21-
from fastNLP.loader.dataset_loader import ClassDatasetLoader
21+
from fastNLP.loader.dataset_loader import ClassDataSetLoader
2222
from fastNLP.loader.preprocess import ClassPreprocess
2323
from fastNLP.core.trainer import ClassificationTrainer
2424
from fastNLP.core.inference import ClassificationInfer
@@ -50,7 +50,7 @@ pre-processing data, constructing model and training model.
5050
train_path = 'test/data_for_tests/text_classify.txt' # training set file
5151
5252
# load dataset
53-
ds_loader = ClassDatasetLoader("train", train_path)
53+
ds_loader = ClassDataSetLoader("train", train_path)
5454
data = ds_loader.load()
5555
5656
# pre-process dataset

examples/readme_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from fastNLP.core.predictor import ClassificationInfer
44
from fastNLP.core.preprocess import ClassPreprocess
55
from fastNLP.core.trainer import ClassificationTrainer
6-
from fastNLP.loader.dataset_loader import ClassDatasetLoader
6+
from fastNLP.loader.dataset_loader import ClassDataSetLoader
77
from fastNLP.models.base_model import BaseModel
88
from fastNLP.modules import aggregator
99
from fastNLP.modules import decoder
@@ -36,7 +36,7 @@ def forward(self, x):
3636
train_path = './data_for_tests/text_classify.txt' # training set file
3737

3838
# load dataset
39-
ds_loader = ClassDatasetLoader(train_path)
39+
ds_loader = ClassDataSetLoader()
4040
data = ds_loader.load()
4141

4242
# pre-process dataset

fastNLP/core/batch.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def __init__(self, dataset, batch_size, sampler, use_cuda):
1717
:param dataset: a DataSet object
1818
:param batch_size: int, the size of the batch
1919
:param sampler: a Sampler object
20-
:param use_cuda: bool, whetjher to use GPU
20+
:param use_cuda: bool, whether to use GPU
2121
2222
"""
2323
self.dataset = dataset
@@ -37,15 +37,12 @@ def __next__(self):
3737
"""
3838
3939
:return batch_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
40-
batch_x also contains an item (str: list of int) about origin lengths,
41-
which means ("field_name_origin_len": origin lengths).
4240
E.g.
4341
::
4442
{'text': tensor([[ 0, 1, 2, 3, 0, 0, 0], [ 4, 5, 2, 6, 7, 8, 9]]), 'text_origin_len': [4, 7]}
4543
4644
batch_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
4745
All tensors in both batch_x and batch_y will be cuda tensors if use_cuda is True.
48-
The names of fields are defined in preprocessor's convert_to_dataset method.
4946
5047
"""
5148
if self.curidx >= len(self.idx_list):
@@ -54,34 +51,24 @@ def __next__(self):
5451
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
5552
padding_length = {field_name: max(field_length[self.curidx: endidx])
5653
for field_name, field_length in self.lengths.items()}
57-
origin_lengths = {field_name: field_length[self.curidx: endidx]
58-
for field_name, field_length in self.lengths.items()}
59-
6054
batch_x, batch_y = defaultdict(list), defaultdict(list)
55+
56+
# transform index to tensor and do padding for sequences
6157
for idx in range(self.curidx, endidx):
6258
x, y = self.dataset.to_tensor(idx, padding_length)
6359
for name, tensor in x.items():
6460
batch_x[name].append(tensor)
6561
for name, tensor in y.items():
6662
batch_y[name].append(tensor)
6763

68-
batch_origin_length = {}
69-
# combine instances into a batch
64+
# combine instances to form a batch
7065
for batch in (batch_x, batch_y):
7166
for name, tensor_list in batch.items():
7267
if self.use_cuda:
7368
batch[name] = torch.stack(tensor_list, dim=0).cuda()
7469
else:
7570
batch[name] = torch.stack(tensor_list, dim=0)
7671

77-
# add origin lengths in batch_x
78-
for name, tensor in batch_x.items():
79-
if self.use_cuda:
80-
batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name]).cuda()
81-
else:
82-
batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name])
83-
batch_x.update(batch_origin_length)
84-
8572
self.curidx = endidx
8673
return batch_x, batch_y
8774

0 commit comments

Comments
 (0)