
Commit 7eaa428

Hellcassius and mergify[bot] authored
[infra] Python 1.6.4 (#1188)
* feat: refactor update columns function in table.py
* expose chunk_size parameter
* fix: make staging data accessible
* fix: add parquet to storage options
* bump version
* fix: change bd_bdm_table_schema to new format
* feat: test mergify and pylint
* fix: change spatial_coverage_tree to its own endpoint
* feat: publish python-1.6.4
* Update table-approve.yml

Co-authored-by: hellcassius <caiorogerio.santos@gmail.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
1 parent 00fd76f commit 7eaa428


9 files changed: +105 −41 lines changed

.github/workflows/data-check.yml

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 pyarrow pytest toml
+          pip install basedosdados==1.6.4 pyarrow pytest toml
       - name: Set up base dos dados environment
         shell: bash
         env:

.github/workflows/metadata-validate.yml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 toml
+          pip install basedosdados==1.6.4 toml
       - name: Set up base dos dados environment
         run: python .github/workflows/env-setup/env_setup.py
         shell: bash

.github/workflows/table-approve.yml

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 toml
+          pip install basedosdados==1.6.4 toml
       - name: Set up gcloud
         uses: google-github-actions/setup-gcloud@v0
         with:
@@ -116,7 +116,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install basedosdados==1.6.4b5 pyarrow pytest toml
+          pip install basedosdados==1.6.4 pyarrow pytest toml
       - name: Set up basedosdados environment
         run: |
           cd .github/workflows/env-setup

python-package/basedosdados/cli/cli.py

Lines changed: 32 additions & 4 deletions
@@ -280,6 +280,11 @@ def init_table(
     default=None,
     help="Location of dataset data. List of possible region names locations: https://cloud.google.com/bigquery/docs/locations",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
 def create_table(
     ctx,
@@ -295,6 +300,7 @@ def create_table(
     columns_config_url_or_path,
     dataset_is_public,
     location,
+    chunk_size,
 ):
 
     Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).create(
@@ -308,6 +314,7 @@ def create_table(
         columns_config_url_or_path=columns_config_url_or_path,
         dataset_is_public=dataset_is_public,
         location=location,
+        chunk_size=chunk_size,
     )
 
     click.echo(
@@ -428,11 +435,21 @@ def delete_table(ctx, dataset_id, table_id, mode):
     default="raise",
     help="[raise|replace|pass] if file alread exists",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
-def upload_table(ctx, dataset_id, table_id, filepath, partitions, if_exists):
+def upload_table(
+    ctx, dataset_id, table_id, filepath, partitions, if_exists, chunk_size
+):
 
     blob_name = Table(table_id=table_id, dataset_id=dataset_id, **ctx.obj).append(
-        filepath=filepath, partitions=partitions, if_exists=if_exists
+        filepath=filepath,
+        partitions=partitions,
+        if_exists=if_exists,
+        chunk_size=chunk_size,
     )
 
     click.echo(
@@ -493,12 +510,23 @@ def init_storage(ctx, bucket_name, replace, very_sure):
     default="raise",
     help="[raise|replace|pass] if file alread exists",
 )
+@click.option(
+    "--chunk_size",
+    default=None,
+    help="The size of a chunk of data whenever iterating (in bytes). This must be a multiple of 256 KB per the API specification.",
+)
 @click.pass_context
-def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):
+def upload_storage(
+    ctx, dataset_id, table_id, filepath, mode, partitions, if_exists, chunk_size
+):
 
     ctx.obj.pop("bucket_name")
     blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
-        filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
+        filepath=filepath,
+        mode=mode,
+        partitions=partitions,
+        if_exists=if_exists,
+        chunk_size=chunk_size,
     )
 
     click.echo(

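For reference, the new --chunk_size flag on these commands is simply forwarded to the Python API. A minimal sketch of the equivalent call, assuming only the parameters visible in this diff; the dataset/table IDs, the file path, and the 50 MB value are placeholders:

# Sketch only: identifiers and path are hypothetical; chunk_size must be a
# multiple of 256 KB (here 50 MB).
from basedosdados.upload.table import Table

tb = Table(dataset_id="my_dataset", table_id="my_table")
tb.create(
    path="data/my_table.csv",
    if_storage_data_exists="replace",
    chunk_size=50 * 1024 * 1024,
)
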
python-package/basedosdados/upload/dataset.py

Lines changed: 32 additions & 21 deletions
@@ -120,28 +120,39 @@ def publicize(self, mode="all", dataset_is_public=True):
             dataset = m["client"].get_dataset(m["id"])
             entries = dataset.access_entries
             # TODO https://github.com/basedosdados/mais/pull/1020
-            if dataset_is_public and "staging" not in dataset.dataset_id:
-                entries.extend(
-                    [
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.dataViewer",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.metadataViewer",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                        bigquery.AccessEntry(
-                            role="roles/bigquery.user",
-                            entity_type="iamMember",
-                            entity_id="allUsers",
-                        ),
-                    ]
-                )
+            # TODO if staging dataset is private, the prod view can't acess it: if dataset_is_public and "staging" not in dataset.dataset_id:
+            if dataset_is_public:
+                if "staging" not in dataset.dataset_id:
+                    entries.extend(
+                        [
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.dataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.metadataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.user",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                        ]
+                    )
+                else:
+                    entries.extend(
+                        [
+                            bigquery.AccessEntry(
+                                role="roles/bigquery.dataViewer",
+                                entity_type="iamMember",
+                                entity_id="allUsers",
+                            ),
+                        ]
+                    )
             dataset.access_entries = entries
-
             m["client"].update_dataset(dataset, ["access_entries"])
             logger.success(
                 " {object} {object_id}_{mode} was {action}!",

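The reworked publicize keeps the three allUsers entries for prod datasets and now also grants roles/bigquery.dataViewer on staging datasets, so the prod views can read their staging sources. A hedged sketch of calling it directly; the dataset ID is a placeholder:

# Sketch only: re-applies the access entries shown above; "my_dataset" is hypothetical.
from basedosdados.upload.dataset import Dataset

ds = Dataset(dataset_id="my_dataset")
ds.publicize(mode="all", dataset_is_public=True)
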
python-package/basedosdados/upload/metadata.py

Lines changed: 2 additions & 6 deletions
@@ -212,14 +212,10 @@ def metadata_schema(self) -> dict:
 
         if self.table_id:
             table_url = f"{self.CKAN_URL}/api/3/action/bd_bdm_table_schema"
-            table_schema = requests.get(table_url).json().get("result")
-
-            return table_schema
+            return requests.get(table_url).json().get("result")
 
         dataset_url = f"{self.CKAN_URL}/api/3/action/bd_dataset_schema"
-        dataset_schema = requests.get(dataset_url).json().get("result")
-
-        return dataset_schema
+        return requests.get(dataset_url).json().get("result")
 
     def exists_in_ckan(self) -> bool:
         """Check if Metadata object refers to an existing CKAN package or reso

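metadata_schema now returns the CKAN responses directly, using the new bd_bdm_table_schema action. For reference, the same endpoints can be queried standalone; the base URL below is an assumption (in the package it comes from self.CKAN_URL):

# Sketch only: CKAN_URL is an assumed value; the package reads it from its own config.
import requests

CKAN_URL = "https://basedosdados.org"  # assumed base URL

table_schema = requests.get(f"{CKAN_URL}/api/3/action/bd_bdm_table_schema").json().get("result")
dataset_schema = requests.get(f"{CKAN_URL}/api/3/action/bd_dataset_schema").json().get("result")
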
python-package/basedosdados/upload/storage.py

Lines changed: 11 additions & 2 deletions
@@ -113,6 +113,7 @@ def upload(
         mode="all",
         partitions=None,
         if_exists="raise",
+        chunk_size=None,
         **upload_args,
     ):
         """Upload to storage at `<bucket_name>/<mode>/<dataset_id>/<table_id>`. You can:
@@ -158,6 +159,10 @@ def upload(
                 * 'raise' : Raises Conflict exception
                 * 'replace' : Replace table
                 * 'pass' : Do nothing
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
 
             upload_args ():
                 Extra arguments accepted by [`google.cloud.storage.blob.Blob.upload_from_file`](https://googleapis.dev/python/storage/latest/blobs.html?highlight=upload_from_filename#google.cloud.storage.blob.Blob.upload_from_filename)
@@ -169,7 +174,11 @@ def upload(
         path = Path(path)
 
         if path.is_dir():
-            paths = [f for f in path.glob("**/*") if f.is_file() and f.suffix == ".csv"]
+            paths = [
+                f
+                for f in path.glob("**/*")
+                if f.is_file() and f.suffix in [".csv", ".parquet", "parquet.gzip"]
+            ]
 
             parts = [
                 (
@@ -197,7 +206,7 @@
 
                 blob_name = self._build_blob_name(filepath.name, m, part)
 
-                blob = self.bucket.blob(blob_name)
+                blob = self.bucket.blob(blob_name, chunk_size=chunk_size)
 
                 if not blob.exists() or if_exists == "replace":

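With these changes, Storage.upload picks up parquet files when given a directory and lets the caller tune the GCS blob chunk size. A minimal sketch; the IDs, the path, and the 50 MB value are placeholders:

# Sketch only: uploads every .csv/.parquet file under the directory to the
# staging folder of the bucket, using 50 MB blob chunks (a multiple of 256 KB).
from basedosdados.upload.storage import Storage

st = Storage(dataset_id="my_dataset", table_id="my_table")
st.upload(
    "data/my_table/",
    mode="staging",
    if_exists="replace",
    chunk_size=50 * 1024 * 1024,
)
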
python-package/basedosdados/upload/table.py

Lines changed: 22 additions & 2 deletions
@@ -567,6 +567,7 @@ def create(
         columns_config_url_or_path=None,
         dataset_is_public=True,
         location=None,
+        chunk_size=None,
     ):
         """Creates BigQuery table at staging dataset.
 
@@ -626,6 +627,10 @@
             location (str): Optional. Location of dataset data.
                 List of possible region names locations: https://cloud.google.com/bigquery/docs/locations
 
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
         """
 
         if path is None:
@@ -651,7 +656,10 @@
         ):
 
             Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
-                path, mode="staging", if_exists=if_storage_data_exists
+                path,
+                mode="staging",
+                if_exists=if_storage_data_exists,
+                chunk_size=chunk_size,
             )
 
         # Create Dataset if it doesn't exist
@@ -835,7 +843,14 @@ def delete(self, mode):
             action="deleted",
         )
 
-    def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
+    def append(
+        self,
+        filepath,
+        partitions=None,
+        if_exists="replace",
+        chunk_size=None,
+        **upload_args,
+    ):
         """Appends new data to existing BigQuery table.
 
         As long as the data has the same schema. It appends the data in the
@@ -854,6 +869,10 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
                 * 'raise' : Raises Conflict exception
                 * 'replace' : Replace table
                 * 'pass' : Do nothing
+            chunk_size (int): Optional
+                The size of a chunk of data whenever iterating (in bytes).
+                This must be a multiple of 256 KB per the API specification.
+                If not specified, the chunk_size of the blob itself is used. If that is not specified, a default value of 40 MB is used.
         """
         if not self.table_exists("staging"):
             raise BaseDosDadosException(
@@ -865,6 +884,7 @@ def append(self, filepath, partitions=None, if_exists="replace", **upload_args):
             mode="staging",
             partitions=partitions,
             if_exists=if_exists,
+            chunk_size=chunk_size,
             **upload_args,
         )
         logger.success(

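Table.append simply forwards chunk_size to Storage.upload, so large appends can use bigger blob chunks. A minimal usage sketch; the IDs, the path, and the size are placeholder values:

# Sketch only: appends a local file to the existing staging table, using a
# 50 MB blob chunk size (must be a multiple of 256 KB).
from basedosdados.upload.table import Table

tb = Table(dataset_id="my_dataset", table_id="my_table")
tb.append(
    filepath="data/2022_update.csv",
    if_exists="replace",
    chunk_size=50 * 1024 * 1024,
)
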
python-package/pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ packages = [
 ]
 readme = "README.md"
 repository = "https://github.com/base-dos-dados/bases"
-version = "1.6.3-beta.2"
+version = "1.6.4"
 
 [tool.poetry.scripts]
 basedosdados = 'basedosdados.cli.cli:cli'
@@ -26,6 +26,7 @@ click = "8.0.3"
 google-cloud-bigquery = "2.30.1"
 google-cloud-bigquery-storage = "1.1.0"
 google-cloud-storage = "1.42.3"
+importlib-metadata = "^4.11.3"
 ipykernel = "5.3.4"
 jupyter = "^1.0.0"
 loguru = "^0.6.0"
@@ -44,7 +45,6 @@ python = ">=3.7.1,<3.11"
 toml = "^0.10.2"
 tomlkit = "0.7.0"
 tqdm = "4.50.2"
-importlib-metadata = "^4.11.3"
 
 [tool.black]
 # Use the more relaxed max line length permitted in PEP8.
