
Commit bfb69f0

gtoonstra authored and jinhyukchang committed
Implements BigQuery metadata extractor (#51)
* Implements BigQuery metadata extractor
* Increment version
* Increment version correctly to 1.2.0
* Set version to 1.1.0
1 parent bbefcd8 commit bfb69f0

4 files changed: +520 -1 lines changed
databuilder/extractor/bigquery_metadata_extractor.py

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
import logging
from collections import namedtuple

import google.auth
import google.oauth2.service_account
import google_auth_httplib2
from googleapiclient.discovery import build
import httplib2
from pyhocon import ConfigTree  # noqa: F401
from typing import List, Any  # noqa: F401

from databuilder.extractor.base_extractor import Extractor
from databuilder.models.table_metadata import TableMetadata, ColumnMetadata


DatasetRef = namedtuple('DatasetRef', ['datasetId', 'projectId'])
TableKey = namedtuple('TableKey', ['schema_name', 'table_name'])

LOGGER = logging.getLogger(__name__)

class BigQueryMetadataExtractor(Extractor):

    """ A metadata extractor for BigQuery tables, taking the schema metadata
    from the Google Cloud BigQuery APIs. This extractor goes through all visible
    datasets in the project identified by project_id and iterates over all tables
    it finds. A separate account is configurable through the key_path parameter,
    which should point to a valid JSON file corresponding to a service account.

    This extractor supports nested columns, which are delimited by a dot (.) in the
    column name.
    """
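    # Illustration with a hypothetical schema: a RECORD column 'address' with
    # nested fields 'street' and 'city' is emitted as three columns named
    # 'address', 'address.street' and 'address.city'.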
    PROJECT_ID_KEY = 'project_id'
    KEY_PATH_KEY = 'key_path'
    PAGE_SIZE_KEY = 'page_size'
    FILTER_KEY = 'filter'
    # The trailing comma matters: scopes must be a sequence of strings; a bare
    # parenthesized string would be treated as a sequence of characters.
    _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/bigquery.readonly',)
    DEFAULT_PAGE_SIZE = 300
    NUM_RETRIES = 3
    def init(self, conf):
        # type: (ConfigTree) -> None
        self.key_path = conf.get_string(BigQueryMetadataExtractor.KEY_PATH_KEY, None)
        self.project_id = conf.get_string(BigQueryMetadataExtractor.PROJECT_ID_KEY)
        self.pagesize = conf.get_int(
            BigQueryMetadataExtractor.PAGE_SIZE_KEY,
            BigQueryMetadataExtractor.DEFAULT_PAGE_SIZE)
        self.filter = conf.get_string(BigQueryMetadataExtractor.FILTER_KEY, '')

        if self.key_path:
            credentials = (
                google.oauth2.service_account.Credentials.from_service_account_file(
                    self.key_path, scopes=BigQueryMetadataExtractor._DEFAULT_SCOPES))
        else:
            credentials, _ = google.auth.default(scopes=BigQueryMetadataExtractor._DEFAULT_SCOPES)

        http = httplib2.Http()
        authed_http = google_auth_httplib2.AuthorizedHttp(credentials, http=http)
        self.bigquery_service = build('bigquery', 'v2', http=authed_http, cache_discovery=False)
        self.datasets = self._retrieve_datasets()
        self.iter = iter(self._iterate_over_tables())
    def extract(self):
        # type: () -> Any
        try:
            return next(self.iter)
        except StopIteration:
            return None

    def _iterate_over_tables(self):
        # type: () -> Any
        for dataset in self.datasets:
            for entry in self._retrieve_tables(dataset):
                yield entry
    def _retrieve_datasets(self):
        # type: () -> List[DatasetRef]
        datasets = []
        for page in self._page_dataset_list_results():
            if 'datasets' not in page:
                continue

            for dataset in page['datasets']:
                dataset_ref = dataset['datasetReference']
                ref = DatasetRef(**dataset_ref)
                datasets.append(ref)

        return datasets
    def _page_dataset_list_results(self):
        # type: () -> Any
        response = self.bigquery_service.datasets().list(
            projectId=self.project_id,
            all=False,  # Do not return hidden datasets
            filter=self.filter,
            maxResults=self.pagesize).execute(
            num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

        while response:
            yield response

            if 'nextPageToken' in response:
                response = self.bigquery_service.datasets().list(
                    projectId=self.project_id,
                    all=False,  # Keep hidden datasets excluded on follow-up pages too
                    filter=self.filter,
                    maxResults=self.pagesize,
                    pageToken=response['nextPageToken']).execute(
                    num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
            else:
                response = None
    def _retrieve_tables(self, dataset):
        # type: (DatasetRef) -> Any
        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table = self.bigquery_service.tables().get(
                    projectId=tableRef['projectId'],
                    datasetId=tableRef['datasetId'],
                    tableId=tableRef['tableId']).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

                # BigQuery tables also have interesting metadata about partitioning,
                # data location (EU/US), mod/create time, etc. Extract that some other time?
                schema = table['schema']
                cols = []
                if 'fields' in schema:
                    total_cols = 0
                    for column in schema['fields']:
                        # Appends to cols as a side effect and returns the next sort order
                        total_cols = self._iterate_over_cols('', column, cols, total_cols)

                table_meta = TableMetadata(
                    database='bigquery',
                    cluster=tableRef['projectId'],
                    schema_name=tableRef['datasetId'],
                    name=tableRef['tableId'],
                    description=table.get('description', ''),
                    columns=cols,
                    is_view=table['type'] == 'VIEW')

                yield table_meta
    def _iterate_over_cols(self, parent, column, cols, total_cols):
        # type: (str, dict, List[ColumnMetadata], int) -> int
        if len(parent) > 0:
            col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
        else:
            col_name = column['name']

        col = ColumnMetadata(
            name=col_name,
            description=column.get('description', ''),
            col_type=column['type'],
            sort_order=total_cols)
        cols.append(col)
        total_cols += 1

        # RECORD columns are flattened: each nested field becomes its own
        # column, prefixed with the parent name and a dot.
        if column['type'] == 'RECORD':
            for field in column['fields']:
                total_cols = self._iterate_over_cols(col_name, field, cols, total_cols)

        return total_cols
    def _page_table_list_results(self, dataset):
        # type: (DatasetRef) -> Any
        response = self.bigquery_service.tables().list(
            projectId=dataset.projectId,
            datasetId=dataset.datasetId,
            maxResults=self.pagesize).execute(
            num_retries=BigQueryMetadataExtractor.NUM_RETRIES)

        while response:
            yield response

            if 'nextPageToken' in response:
                response = self.bigquery_service.tables().list(
                    projectId=dataset.projectId,
                    datasetId=dataset.datasetId,
                    maxResults=self.pagesize,
                    pageToken=response['nextPageToken']).execute(
                    num_retries=BigQueryMetadataExtractor.NUM_RETRIES)
            else:
                response = None
    def get_scope(self):
        # type: () -> str
        return 'extractor.bigquery_table_metadata'
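
For orientation, a minimal sketch of driving the extractor directly, assuming application default credentials are available and 'my-gcp-project' stands in for a real project id:

    from pyhocon import ConfigFactory

    from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor

    extractor = BigQueryMetadataExtractor()
    # init() receives the extractor-scoped config, so keys are unprefixed here
    extractor.init(ConfigFactory.from_dict({'project_id': 'my-gcp-project'}))

    record = extractor.extract()  # returns None once all tables are exhausted
    while record:
        print(record)
        record = extractor.extract()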
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
"""
This is an example script for extracting BigQuery table metadata
"""

import logging
from pyhocon import ConfigFactory
import sqlite3

from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer

logging.basicConfig(level=logging.INFO)

# replace localhost with docker host ip
# todo: get the ip from input argument
NEO4J_ENDPOINT = 'bolt://localhost:7687'
neo4j_endpoint = NEO4J_ENDPOINT

neo4j_user = 'neo4j'
neo4j_password = 'test'


def create_connection(db_file):
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Exception:
        logging.exception('exception')
    return None


# todo: Add a second model
def create_bq_job(metadata_type, gcloud_project):
    tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=metadata_type)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
            gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job


if __name__ == "__main__":
    # start table job
    job1 = create_bq_job('bigquery_metadata', 'your-project-here')
    job1.launch()
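
By default the extractor relies on application default credentials; to authenticate with a service account key file instead, the job_config above could also set the extractor's key_path option. A sketch, with a placeholder path:

    'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.KEY_PATH_KEY):
        '/path/to/service-account.json',  # placeholder; point at a real key file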

setup.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages


-__version__ = '1.0.15'
+__version__ = '1.1.0'


 setup(
