- import pandas as pd
+ import pandas
from labelbox import Client as labelboxClient
import labelbase
from labelpandas import connector
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

- def create_data_row_upload_dict(client:labelboxClient, table:dict,
+ def create_data_row_upload_dict(client:labelboxClient, table:pandas.core.frame.DataFrame, table_dict:dict,
                                  row_data_col:str, global_key_col:str, external_id_col:str, dataset_id_col:str,
                                  dataset_id:str, metadata_index:dict, attachment_index:dict,
                                  divider:str, verbose:bool, extra_client:bool=None):
    """ Multithreads over a Pandas DataFrame, calling create_data_rows() on each row to return an upload dictionary
    Args:
        client : Required (labelbox.client.Client) - Labelbox Client object
-       table : Required (dict) - Pandas DataFrame as dict with df.to_dict("records")
+       table : Required (pandas.core.frame.DataFrame) - Pandas DataFrame
+       table_dict : Required (dict) - Pandas DataFrame as dict with df.to_dict("records")
        row_data_col : Required (str) - Column containing asset URL or raw text
        global_key_col : Required (str) - Column name containing the data row global key - defaults to row data
        external_id_col : Required (str) - Column name containing the data row external ID - defaults to global key
@@ -28,18 +29,18 @@ def create_data_row_upload_dict(client:labelboxClient, table:dict,
        - global_key_to_upload_dict - Dictionary where {key=global_key : value=data row dictionary in upload format}
        - errors - List of dictionaries containing conversion error information; see connector.create_data_rows() for more information
    """
-     table_length = len(df_dict)
+     table_length = connector.get_table_length_function(table=table)
    if verbose:
        print(f'Creating upload list - {table_length} rows in Pandas DataFrame')
-     unique_global_key_count = len(list(set([str(row_dict[global_key_col]) for row_dict in df_dict])))
+     unique_global_key_count = len(connector.get_unique_values_function(table=table, column_name=global_key_col))
    if table_length != unique_global_key_count:
        print(f"Warning: Your global key column is not unique - upload will resume, only uploading 1 data row per unique global key")
    metadata_schema_to_name_key = labelbase.metadata.get_metadata_schema_to_name_key(client=client, lb_mdo=False, divider=divider, invert=False)
    metadata_name_key_to_schema = labelbase.metadata.get_metadata_schema_to_name_key(client=client, lb_mdo=False, divider=divider, invert=True)
    if dataset_id:
        dataset_to_global_key_to_upload_dict = {dataset_id: {}}
    else:
-         dataset_to_global_key_to_upload_dict = {id: {} for id in list(set([str(row_dict[dataset_id_col]) for row_dict in df_dict]))}
+         dataset_to_global_key_to_upload_dict = {id: {} for id in connector.get_unique_values_function(table=table, column_name=dataset_id_col)}
    with ThreadPoolExecutor(max_workers=8) as exc:
        errors = []
        futures = []
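
For orientation, a minimal usage sketch of the revised signature follows. It assumes create_data_row_upload_dict is imported from wherever this module lives in the package; the DataFrame columns, API key, dataset ID, and divider value are placeholders for illustration, not values taken from this PR.

import pandas
from labelbox import Client as labelboxClient
# from labelpandas.<this_module> import create_data_row_upload_dict  # module path depends on repo layout

# Placeholder DataFrame: one asset URL per row plus a unique global key
df = pandas.DataFrame({
    "row_data": ["https://example.com/image-1.jpg", "https://example.com/image-2.jpg"],
    "global_key": ["image-1", "image-2"],
})

client = labelboxClient(api_key="YOUR_API_KEY")  # placeholder API key

# Per the docstring, this returns an upload dictionary plus a list of conversion errors
upload_dict, errors = create_data_row_upload_dict(
    client=client,
    table=df,                          # new argument: the DataFrame itself
    table_dict=df.to_dict("records"),  # new argument: the same table as a list of row dicts
    row_data_col="row_data",
    global_key_col="global_key",
    external_id_col="global_key",
    dataset_id_col="",
    dataset_id="YOUR_DATASET_ID",      # placeholder dataset ID
    metadata_index={},
    attachment_index={},
    divider="///",                     # assumed divider value
    verbose=True,
)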