使用自定义schema时在create_collection中创建动态字段失效 #41316

IeohMingChan · 2025-04-15T08:51:24Z

IeohMingChan
Apr 15, 2025

背景：
我已通过自定义schema创建一个集合，在创建集合时设置动态字段：client.create_collection(new_collection_name, schema=schema, enable_dynamic_field=True)并写入若干数据。随后，我尝试在该集合中插入包含未定义字段的值时发生报错：Error fetching documents: <DataNotMatchException: (code=1, message=Attempt to insert an unexpected field dynamic_fields_1 to collection without enabling dynamic field)>
经过测试发现，使用自定义schema时必须在client.create_schema方法中设置动态字段client.create_schema(enable_dynamic_field=True)，而create_collection中所设动态字段的值失效，即只要不在create_schema中设置动态字段，无论在create_collection设置何值，动态字段均不启用。
问题：我目前已创建集合并写入海量数据，当时是根据官方文档通过create_collection设置动态字段，而不是create_schema。现在我希望往该集合中写入未定义的字段，请问可以如何设置使该集合启用动态字段以避免重写数据，还是必须重建集合并通过create_schema启用动态字段？
所用代码如下，其中create_schema及create_collection中动态字段值已尝试过各种值以印证上述结论。感谢

相关代码

import concurrent.futures

import re
import time

from loguru import logger

from pymilvus import connections, Collection, utility, DataType, MilvusClient, FunctionType, Function

# Module-level accumulators filled in by the worker functions below.
num_entities = []   # entity count of each source collection (appended in process_collection)
insert_counts = []  # number of documents copied per source collection
kb_ids = []         # last kb_id seen in each source collection

# NOTE(review): "xxx" / xxx appear to be redacted placeholders; the bare name
# ``xxx`` is undefined, so a real host and port must be filled in before running.
host = "xxx"
port = xxx

uri = f"http://{host}:{port}"
old_db = "knowledge"
new_db = "knowledge_base"
new_collection_name = "knowledge_xiaobu_3"
# When True, main() drops and recreates ``new_collection_name`` before copying.
is_create_collection = False

# Connection (alias 'source') used to read the old collections; opened at import time.
connections.connect(alias='source', host=host, port=port, db_name=old_db, user="root", password="Milvus")

def generate_binary_string(input_list, length=1024):
    """Build a fixed-length string of '0'/'1' flags from a list of positions.

    Args:
        input_list: Iterable of integer indices whose positions are set to '1'.
        length: Number of characters in the result. Defaults to 1024.

    Returns:
        A string of ``length`` characters in which position ``i`` is '1' when
        ``i`` occurs in ``input_list`` and '0' otherwise.
    """
    # Start with every position cleared.
    result = ['0'] * length

    for index in input_list:
        if 0 <= index < length:
            result[index] = '1'
        else:
            # Out-of-range values are reported and skipped rather than raising.
            print(f"Warning: Index {index} is out of range (0-{length - 1}). Ignoring this value.")

    return ''.join(result)

def get_kb_id(source):
    """Extract a numeric id from a slash-delimited ``source`` string.

    Args:
        source: String that may contain ``/<digits>/`` segments.

    Returns:
        The last ``/<digits>/`` segment found by ``re.findall``, as an int,
        or 0 when no such segment exists.
    """
    res = re.findall(r'/(\d+)/', source)
    # NOTE(review): matches are non-overlapping, so of two adjacent numeric
    # segments ("/1/2/") only the first is captured -- confirm this is intended.
    if res:
        return int(res[-1])
    return 0

def fetch_documents_from_collection(collection: Collection, new_collection: str, client: MilvusClient, partition_name: str, doc_type: str):
    """Copy every entity of ``collection`` into a partition of ``new_collection``.

    Entities are read through a query iterator (100 per page), reshaped into
    the new field layout, and inserted once at least 500 documents are
    buffered. The partition is created first when it does not exist yet.

    Args:
        collection: Source collection to read from.
        new_collection: Name of the target collection.
        client: Client used for partition management and inserts on the target.
        partition_name: Target partition name.
        doc_type: Value stored in the ``doc_type`` field of every copied document.

    Returns:
        The number of entities read from ``collection``.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    # Initialised up front so the ``finally`` clause can tell whether the
    # iterator was ever created.
    iterator = None
    try:
        if not client.has_partition(collection_name=new_collection, partition_name=partition_name):
            client.create_partition(
                collection_name=new_collection,
                partition_name=partition_name
            )

        output_fields = ['source', 'seq_num', 'slice_id', 'slice_type', 'full_text', 'text', 'vector']
        total_docs = 0
        collection.load()

        iterator = collection.query_iterator(
            output_fields=output_fields,
            batch_size=100,
            limit=-1
        )

        documents = []
        kb_id = None
        while True:
            result = iterator.next()
            if not result:
                break

            total_docs += len(result)

            for record in result:
                source = record['source']
                # An empty or blank seq_num is stored as -1.
                seq_num = int(record['seq_num']) if record['seq_num'].strip() != '' else -1
                kb_id = get_kb_id(source)

                documents.append({
                    'source': source,
                    'seq_num': seq_num,
                    'kb_id': kb_id,
                    'slice_id': record['slice_id'],
                    'slice_type': record['slice_type'],
                    'full_text': record['full_text'].strip(),
                    'text': record['text'].strip(),
                    'vector': record['vector'],
                    'doc_type': doc_type,
                    # Fields not declared in the schema; they are only accepted
                    # when the target collection has dynamic fields enabled.
                    'dynamic_fields_1': "动态字段测试1" + record['slice_id'],
                    'dynamic_fields_2': "动态字段测试2" + record['slice_type'],
                    'dynamic_fields_3': "动态字段测试3" + source,
                })

            # Insert in batches to reduce the number of insert operations.
            if len(documents) >= 500:
                client.insert(new_collection, documents, timeout=20, partition_name=partition_name)
                documents = []

        # Only the kb_id of the last record read is recorded for this collection.
        if kb_id is not None:
            kb_ids.append(kb_id)
        insert_counts.append(total_docs)

        # Flush whatever is left in the buffer.
        if documents:
            client.insert(new_collection, documents, timeout=20, partition_name=partition_name)

        logger.info(f"Finished fetching and inserting documents,{collection.name}: {total_docs}.")
        return total_docs

    except Exception as e:
        # loguru attaches the traceback via ``exception``; the error is passed
        # as a positional format argument so braces in its text are not
        # interpreted as format fields.
        logger.exception("Error fetching documents: {}", e)
        raise
    finally:
        if iterator is not None:
            iterator.close()

def create_schema(client):
    """Build the schema of the target collection.

    The schema declares an auto-id INT64 primary key, an analyzed ``text``
    field, a dense ``vector`` field, a sparse ``vector_bm25`` field produced
    from ``text`` by a BM25 function, and several scalar metadata fields.

    Args:
        client: Client whose ``create_schema`` method is used to start the schema.

    Returns:
        The populated schema object.
    """
    # Dynamic fields are enabled here on the schema. The collection was
    # originally created without this flag, and inserting undeclared fields
    # into it then failed.
    schema = client.create_schema(enable_dynamic_field=True)
    tokenizer_params = {
        "tokenizer": "jieba"
    }

    # Field definitions
    schema.add_field("pk", DataType.INT64, is_primary=True, auto_id=True)
    schema.add_field("text", DataType.VARCHAR, max_length=65535, analyzer_params=tokenizer_params,
                     enable_analyzer=True)
    schema.add_field("vector", DataType.FLOAT_VECTOR, dim=1792, mmap_enable=True)
    schema.add_field("vector_bm25", datatype=DataType.SPARSE_FLOAT_VECTOR)

    bm25_function = Function(
        name="text_bm25_emb",  # Function name
        input_field_names=["text"],  # VARCHAR field containing the raw text
        # SPARSE_FLOAT_VECTOR field reserved to store the generated embeddings
        output_field_names=["vector_bm25"],
        function_type=FunctionType.BM25,
    )
    schema.add_function(bm25_function)

    schema.add_field("source", DataType.VARCHAR, max_length=65535)
    schema.add_field("seq_num", DataType.INT32)
    schema.add_field("kb_id", DataType.INT32)
    schema.add_field("slice_id", DataType.VARCHAR, max_length=50)
    schema.add_field("slice_type", DataType.VARCHAR, max_length=50)
    schema.add_field("full_text", DataType.VARCHAR, max_length=65535)
    schema.add_field("doc_type", DataType.VARCHAR, max_length=50)
    return schema

def process_collection(old_collection_name, client):
    """Migrate one source collection into the new collection.

    The entity count of the source collection is recorded in ``num_entities``.
    If the name matches ``vectordb_<number>`` (optionally followed by
    ``_api``), its entities are copied into a partition derived from the
    number; any other name is skipped with a warning.

    Args:
        old_collection_name: Name of the source collection.
        client: Client connected to the target database.
    """
    # Open the existing collection through the 'source' connection.
    old_collection = Collection(name=old_collection_name, using="source")

    # Record how many entities the source collection holds.
    num_entities_old = old_collection.num_entities
    logger.info(f"Total number of entities in collection {old_collection_name}: {num_entities_old}")
    num_entities.append(num_entities_old)

    pattern = r'^(vectordb_\d+)(_api)?$'
    match = re.match(pattern, old_collection_name)
    if not match:
        # Previously skipped silently; log it so missing data can be traced.
        logger.warning(f"Skipping collection {old_collection_name}: name does not match {pattern}")
        return

    # The numeric suffix modulo 1023 picks the partition; 0 maps to "_default".
    num = int(match.group(1).split('_')[-1]) % 1023
    partition_name = f"partition_{num}" if num != 0 else "_default"
    doc_type = 'api' if match.group(2) == '_api' else 'doc'

    fetch_documents_from_collection(old_collection, new_collection_name, client, partition_name, doc_type)

def main():
    """Run the migration: optionally recreate the target collection, then copy.

    When ``is_create_collection`` is true, the target collection is dropped if
    present, recreated from ``create_schema`` and indexed. The selected source
    collections are then processed on a pool of four worker threads, and
    totals are logged at the end.
    """
    time_start = time.time()
    new_client = MilvusClient(uri=uri, db_name=new_db, user="root", password="Milvus")

    if is_create_collection:
        # New schema
        schema = create_schema(new_client)

        # Drop the target collection if it already exists.
        if new_client.has_collection(new_collection_name):
            logger.info(f"Dropping existing collection: {new_collection_name}")
            new_client.drop_collection(new_collection_name)

        # NOTE: when a schema is passed, enable_dynamic_field given here was
        # observed to have no effect; the schema's own setting decides.
        new_client.create_collection(new_collection_name, schema=schema, enable_dynamic_field=False)

        logger.info(f"Created new collection: {new_collection_name} in database target")

        index_params = new_client.prepare_index_params()

        vector_params = {"M": 8, "efConstruction": 64}

        # Dense vector index
        index_params.add_index(
            field_name="vector",
            index_type="HNSW",
            metric_type="IP",
            index_name="vector",
            index_params=vector_params
        )

        # BM25 sparse vector index
        index_params.add_index(
            field_name="vector_bm25",
            index_type="SPARSE_INVERTED_INDEX",
            metric_type="BM25",
            index_name="vector_bm25"
        )

        new_client.create_index(new_collection_name, index_params)

        new_client.alter_index_properties(
            collection_name=new_collection_name,
            index_name="vector",
            properties={"mmap.enabled": True}
        )

        logger.info("已按照定义建好索引")

    # The full listing is immediately overridden by the explicit selection
    # below; add more collection names to that list as needed.
    collection_names = utility.list_collections(using="source")
    collection_names = ['vectordb_101']

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Submit one task per collection and remember which future is which.
        future_to_name = {
            executor.submit(process_collection, name, new_client): name
            for name in collection_names
        }

        completed = 0
        total = len(collection_names)

        # Wait for every task to finish, reporting each outcome.
        for future in concurrent.futures.as_completed(future_to_name):
            name = future_to_name[future]
            try:
                # Re-raises here if the task failed.
                future.result()
            except Exception as exc:
                logger.error(f'{name} generated an exception: {exc}')
            else:
                logger.info(f'{name} completed successfully.')

            completed += 1
            logger.info(f'Progress: {completed}/{total} completed')

    logger.info('\nAll tasks completed.')
    logger.info(f'Total entities: {sum(num_entities)}, Insert counts: {sum(insert_counts)}')
    logger.info(f'Total time: {time.time()-time_start} seconds')

# Script entry point: run the migration only when executed directly.
if __name__ == "__main__":
    # Optional one-time setup, kept for reference (creates the target database):
    # from pymilvus import db
    #
    # connections.connect(host=host, port=port, user=user, password=password)
    # db.create_database(target_db)
    main()

yhmo · 2025-04-15T09:28:31Z

yhmo
Apr 15, 2025
Collaborator

在2.4的文档里，建表方式有两种：

Quick setup
无需输入CollectionSchema，直接建一张有id和vector两个字段的表。这种方式下，create_collection()的enable_dynamic_field参数是有效的。

# Quick setup: no CollectionSchema is passed, so enable_dynamic_field given to
# create_collection() takes effect.
client.create_collection(
    collection_name="quick_setup",
    dimension=5,
    enable_dynamic_field=True,
)

Customized setup
这种方式需要传入一个CollectionSchema，而只有CollectionSchema的enable_dynamic_field参数才能指定是否开启动态字段。

# Customized setup: dynamic fields are switched on through the schema itself.
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

schema.add_field(field_name="my_id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="my_vector", datatype=DataType.FLOAT_VECTOR, dim=5)

# The schema is passed in; no enable_dynamic_field argument is given here.
client.create_collection(
    collection_name="customized_setup_1",
    schema=schema,
)

你用的Customized setup，所以create_collection()的enable_dynamic_field参数不生效。我觉得这里，当create_collection()的enable_dynamic_field和CollectionSchema的enable_dynamic_field不一致的时候，pymilvus应该给一个报错，否则确实很容易误导。

没有开启enable_dynamic_field的表，没法修改。所以只能再建一个开启了enable_dynamic_field新表来导入旧表的数据。

1 reply

IeohMingChan Apr 15, 2025
Author

感谢解答，我使用的是v2.5.6版本，希望下个版本可以修复该问题。官方指南确实有点误导，现在重写数据的成本太大了。

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

使用自定义schema时在create_collection中创建动态字段失效 #41316

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment 1 reply

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

使用自定义schema时在create_collection中创建动态字段失效 #41316

Uh oh!

Uh oh!

IeohMingChan Apr 15, 2025

相关代码

Replies: 1 comment · 1 reply

Uh oh!

yhmo Apr 15, 2025 Collaborator

Uh oh!

IeohMingChan Apr 15, 2025 Author

IeohMingChan
Apr 15, 2025

Replies: 1 comment 1 reply

yhmo
Apr 15, 2025
Collaborator

IeohMingChan Apr 15, 2025
Author