Skip to content

Metadata field can't handle strings longer than 255 characters #23

@yonomitt

Description

@yonomitt

I tried to upload image captions as a metadata point. The idea being I could then filter the dataset based on the contents of the captions. I ran into an error:

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 542247/542247 [00:00<00:00, 2294043.12it/s]
  0%|                                                                                                                                                                                                                                                                   | 0/1 [00:00<?, ?it/s]
---------------------------------------------------------------------------
TransportQueryError                       Traceback (most recent call last)
Cell In[13], line 15
     12 for start in tqdm(range(0, total, batch)):
     13     data = all_metadata[start:start+batch]
---> 15     with ds.metadata_context() as ctx, open(annotations_file) as f:
     16         for image, metadata in data:
     17             ctx.update_metadata(image, metadata)

File ~/.miniforge3/envs/dagstest/lib/python3.10/contextlib.py:142, in _GeneratorContextManager.__exit__(self, typ, value, traceback)
    140 if typ is None:
    141     try:
--> 142         next(self.gen)
    143     except StopIteration:
    144         return False

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource.py:118, in Datasource.metadata_context(self)
    116 ctx = MetadataContextManager(self)
    117 yield ctx
--> 118 self._upload_metadata(ctx.get_metadata_entries())

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/model/datasource.py:183, in Datasource._upload_metadata(self, metadata_entries)
    182 def _upload_metadata(self, metadata_entries: List[DatapointMetadataUpdateEntry]):
--> 183     self.source.client.update_metadata(self, metadata_entries)

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/client/data_client.py:109, in DataClient.update_metadata(self, datasource, entries)
    102 assert len(entries) > 0
    104 params = GqlMutations.update_metadata_params(
    105     datasource_id=datasource.source.id,
    106     datapoints=[e.to_dict() for e in entries]
    107 )
--> 109 return self._exec(q, params)

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/dagshub/data_engine/client/data_client.py:82, in DataClient._exec(self, query, params)
     80     logger.debug(f"Params: {params}")
     81 q = gql.gql(query)
---> 82 resp = self.client.execute(q, variable_values=params)
     83 return resp

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/gql/client.py:403, in Client.execute(self, document, variable_values, operation_name, serialize_variables, parse_result, get_execution_result, **kwargs)
    400     return data
    402 else:  # Sync transports
--> 403     return self.execute_sync(
    404         document,
    405         variable_values=variable_values,
    406         operation_name=operation_name,
    407         serialize_variables=serialize_variables,
    408         parse_result=parse_result,
    409         get_execution_result=get_execution_result,
    410         **kwargs,
    411     )

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/gql/client.py:221, in Client.execute_sync(self, document, variable_values, operation_name, serialize_variables, parse_result, get_execution_result, **kwargs)
    219 """:meta private:"""
    220 with self as session:
--> 221     return session.execute(
    222         document,
    223         variable_values=variable_values,
    224         operation_name=operation_name,
    225         serialize_variables=serialize_variables,
    226         parse_result=parse_result,
    227         get_execution_result=get_execution_result,
    228         **kwargs,
    229     )

File ~/.miniforge3/envs/dagstest/lib/python3.10/site-packages/gql/client.py:860, in SyncClientSession.execute(self, document, variable_values, operation_name, serialize_variables, parse_result, get_execution_result, **kwargs)
    858 # Raise an error if an error is returned in the ExecutionResult object
    859 if result.errors:
--> 860     raise TransportQueryError(
    861         str(result.errors[0]),
    862         errors=result.errors,
    863         data=result.data,
    864         extensions=result.extensions,
    865     )
    867 assert (
    868     result.data is not None
    869 ), "Transport returned an ExecutionResult without data or errors"
    871 if get_execution_result:

TransportQueryError: {'message': 'pq: value too long for type character varying(255)', 'path': ['updateMetadata']}

This was the code:

annotations_file = 'labels.tsv'

all_metadata = []
with open(annotations_file) as f:
    for row in tqdm(f.readlines()):
        image, caption, score = row.split('\t')[:3]
    all_metadata.append((image, {'caption': caption, 'score': score}))

total = len(all_metadata)

batch = 1000
for start in tqdm(range(0, total, batch)):
    data = all_metadata[start:start+batch]

    with ds.metadata_context() as ctx, open(annotations_file) as f:
        for image, metadata in data:
            ctx.update_metadata(image, metadata)

Metadata

Metadata

Assignees

Labels

bugSomething isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions