From ee016ffea0e85cb72825d45d2811b0d5dbe34ee8 Mon Sep 17 00:00:00 2001 From: Vitaly Isaev Date: Tue, 16 Jul 2024 15:55:00 +0300 Subject: [PATCH 01/56] Merge #5984 #6257 #6298 #6471 #6484 #6703 (#6724) Co-authored-by: Timur Sufiyanov --- .../external_sources/external_data_source.cpp | 8 ++ .../libs/actors/clusters_from_connections.cpp | 12 +++ ydb/core/fq/libs/actors/database_resolver.cpp | 56 ++++++++++++- .../libs/actors/ut/database_resolver_ut.cpp | 81 ++++++++++++++++++- ydb/core/fq/libs/common/util.cpp | 11 +++ ydb/core/fq/libs/compute/common/config.h | 1 + .../actors/query_utils.cpp | 13 +++ .../fq/libs/control_plane_proxy/utils/utils.h | 3 + .../request_validators.cpp | 31 +++++-- .../ydb_control_plane_storage_queries.cpp | 2 + .../mdb_endpoint_generator.cpp | 12 ++- .../kqp/provider/yql_kikimr_gateway_ut.cpp | 2 + .../ut_external_data_source.cpp | 4 + ydb/public/api/protos/draft/fq.proto | 10 +++ 14 files changed, 233 insertions(+), 13 deletions(-) diff --git a/ydb/core/external_sources/external_data_source.cpp b/ydb/core/external_sources/external_data_source.cpp index e11c7cce2446..d44c8ca6dbb1 100644 --- a/ydb/core/external_sources/external_data_source.cpp +++ b/ydb/core/external_sources/external_data_source.cpp @@ -36,6 +36,10 @@ struct TExternalDataSource : public IExternalSource { ythrow TExternalSourceException() << "Only external table supports parameters"; } + bool IsRDBMSDataSource(const TProtoStringType& sourceType) const { + return IsIn({"Greenplum", "PostgreSQL", "MySQL", "MsSQLServer", "Clickhouse"}, sourceType); + } + virtual void ValidateExternalDataSource(const TString& externalDataSourceDescription) const override { NKikimrSchemeOp::TExternalDataSourceDescription proto; if (!proto.ParseFromString(externalDataSourceDescription)) { @@ -49,6 +53,10 @@ struct TExternalDataSource : public IExternalSource { ythrow TExternalSourceException() << "Unsupported property: " << key; } + if (IsRDBMSDataSource(proto.GetSourceType()) && 
!proto.GetProperties().GetProperties().contains("database_name")){ + ythrow TExternalSourceException() << proto.GetSourceType() << " source must provide database_name"; + } + ValidateHostname(HostnamePatterns, proto.GetLocation()); } diff --git a/ydb/core/fq/libs/actors/clusters_from_connections.cpp b/ydb/core/fq/libs/actors/clusters_from_connections.cpp index 36a5fe05f2f5..7b2d8187fd1e 100644 --- a/ydb/core/fq/libs/actors/clusters_from_connections.cpp +++ b/ydb/core/fq/libs/actors/clusters_from_connections.cpp @@ -283,6 +283,18 @@ void AddClustersFromConnections( clusters.emplace(connectionName, GenericProviderName); break; } + case FederatedQuery::ConnectionSetting::kMysqlCluster: { + FillGenericClusterConfig( + common, + *gatewaysConfig.MutableGeneric()->AddClusterMapping(), + conn.content().setting().mysql_cluster(), + connectionName, + NYql::NConnector::NApi::EDataSourceKind::MYSQL, + authToken, + accountIdSignatures); + clusters.emplace(connectionName, GenericProviderName); + break; + } // Do not replace with default. Adding a new connection should cause a compilation error case FederatedQuery::ConnectionSetting::CONNECTION_NOT_SET: diff --git a/ydb/core/fq/libs/actors/database_resolver.cpp b/ydb/core/fq/libs/actors/database_resolver.cpp index d098a1ed76a4..74ba4e3ddf89 100644 --- a/ydb/core/fq/libs/actors/database_resolver.cpp +++ b/ydb/core/fq/libs/actors/database_resolver.cpp @@ -319,7 +319,7 @@ class TDatabaseResolver: public TActor // There are two kinds of managed YDBs: serverless and dedicated. // While working with dedicated databases, we have to use underlay network. // That's why we add `u-` prefix to database fqdn. 
- if (databaseInfo.GetMap().contains("dedicatedDatabase")) { + if (databaseInfo.GetMap().contains("storageConfig")) { endpoint = "u-" + endpoint; host = "u-" + host; } @@ -335,7 +335,7 @@ class TDatabaseResolver: public TActor { auto ret = ydbParser(databaseInfo, mdbEndpointGenerator, useTls, protocol); // TODO: Take explicit field from MVP - bool isDedicatedDb = databaseInfo.GetMap().contains("dedicatedDatabase"); + bool isDedicatedDb = databaseInfo.GetMap().contains("storageConfig"); if (!isDedicatedDb && ret.Endpoint.StartsWith("ydb.")) { // Replace "ydb." -> "yds." ret.Endpoint[2] = 's'; @@ -457,6 +457,56 @@ class TDatabaseResolver: public TActor endpoint = mdbEndpointGenerator->ToEndpoint(params); + return TDatabaseDescription{"", endpoint.first, endpoint.second, "", useTls}; + }; + Parsers[NYql::EDatabaseType::MySQL] = []( + NJson::TJsonValue& databaseInfo, + const NYql::IMdbEndpointGenerator::TPtr& mdbEndpointGenerator, + bool useTls, + NConnector::NApi::EProtocol protocol + ) { + NYql::IMdbEndpointGenerator::TEndpoint endpoint; + TVector aliveHosts; + + const auto& hostsArray = databaseInfo.GetMap().at("hosts").GetArraySafe(); + + for (const auto& host : hostsArray) { + const auto& hostMap = host.GetMap(); + + if (!hostMap.contains("services")) { + // indicates that cluster is down + continue; + } + + const auto& servicesArray = hostMap.at("services").GetArraySafe(); + + // check if all services of a particular host are alive + const bool alive = std::all_of( + servicesArray.begin(), + servicesArray.end(), + [](const auto& service) { + return service["health"].GetString() == "ALIVE"; + } + ); + + if (alive) { + aliveHosts.push_back(host["name"].GetString()); + } + } + + if (aliveHosts.empty()) { + ythrow TCodeLineException(TIssuesIds::INTERNAL_ERROR) << "No ALIVE MySQL hosts found"; + } + + NYql::IMdbEndpointGenerator::TParams params = { + .DatabaseType = NYql::EDatabaseType::MySQL, + .MdbHost = aliveHosts[std::rand() % static_cast(aliveHosts.size())], + 
.UseTls = useTls, + .Protocol = protocol, + }; + + endpoint = mdbEndpointGenerator->ToEndpoint(params); + return TDatabaseDescription{"", endpoint.first, endpoint.second, "", useTls}; }; } @@ -538,7 +588,7 @@ class TDatabaseResolver: public TActor url = TUrlBuilder(ev->Get()->YdbMvpEndpoint + "/database") .AddUrlParam("databaseId", databaseId) .Build(); - } else if (IsIn({NYql::EDatabaseType::ClickHouse, NYql::EDatabaseType::PostgreSQL}, databaseType)) { + } else if (IsIn({NYql::EDatabaseType::ClickHouse, NYql::EDatabaseType::PostgreSQL, NYql::EDatabaseType::MySQL}, databaseType)) { YQL_ENSURE(ev->Get()->MdbGateway, "empty MDB Gateway"); url = TUrlBuilder( ev->Get()->MdbGateway + "/managed-" + NYql::DatabaseTypeLowercase(databaseType) + "/v1/clusters/") diff --git a/ydb/core/fq/libs/actors/ut/database_resolver_ut.cpp b/ydb/core/fq/libs/actors/ut/database_resolver_ut.cpp index 2cfc32baa9c4..7295d7beb804 100644 --- a/ydb/core/fq/libs/actors/ut/database_resolver_ut.cpp +++ b/ydb/core/fq/libs/actors/ut/database_resolver_ut.cpp @@ -243,7 +243,7 @@ Y_UNIT_TEST_SUITE(TDatabaseResolverTests) { R"( { "endpoint":"grpcs://lb.etnbrtlini51k7cinbdr.ydb.mdb.yandexcloud.net:2135/?database=/ru-central1/b1gtl2kg13him37quoo6/etn021us5r9rhld1vgbh", - "dedicatedDatabase":{"resuorcePresetId": "medium"} + "storageConfig":{"storageSizeLimit":107374182400} })", NYql::TDatabaseResolverResponse::TDatabaseDescription{ TString{"u-lb.etnbrtlini51k7cinbdr.ydb.mdb.yandexcloud.net:2135"}, @@ -286,7 +286,7 @@ Y_UNIT_TEST_SUITE(TDatabaseResolverTests) { R"( { "endpoint":"grpcs://lb.etn021us5r9rhld1vgbh.ydb.mdb.yandexcloud.net:2135/?database=/ru-central1/b1g7jdjqd07qg43c4fmp/etn021us5r9rhld1vgbh", - "dedicatedDatabase":{"resourcePresetId": "medium"} + "storageConfig":{"storageSizeLimit":107374182400} })", NYql::TDatabaseResolverResponse::TDatabaseDescription{ TString{"u-lb.etn021us5r9rhld1vgbh.ydb.mdb.yandexcloud.net:2135"}, @@ -474,6 +474,7 @@ Y_UNIT_TEST_SUITE(TDatabaseResolverTests) { issues ); } 
+ Y_UNIT_TEST(Greenplum_MasterNode) { Test( NYql::EDatabaseType::Greenplum, @@ -505,7 +506,7 @@ Y_UNIT_TEST_SUITE(TDatabaseResolverTests) { TString(""), true}, {}); - } + } Y_UNIT_TEST(Greenplum_PermissionDenied) { NYql::TIssues issues{ @@ -536,7 +537,79 @@ Y_UNIT_TEST_SUITE(TDatabaseResolverTests) { )", NYql::TDatabaseResolverResponse::TDatabaseDescription{}, issues); - } + } + + Y_UNIT_TEST(MySQL) { + Test( + NYql::EDatabaseType::MySQL, + NYql::NConnector::NApi::EProtocol::NATIVE, + "https://mdb.api.cloud.yandex.net:443/managed-mysql/v1/clusters/etn021us5r9rhld1vgbh/hosts", + "200", + R"({ + "hosts": [ + { + "services": [ + { + "type": "POOLER", + "health": "ALIVE" + }, + { + "type": "MYSQL", + "health": "ALIVE" + } + ], + "name": "rc1b-eyt6dtobu96rwydq.mdb.yandexcloud.net", + "clusterId": "c9qb2bjghs8onbncpamk", + "zoneId": "ru-central1-b", + "role": "MASTER", + "health": "ALIVE" + } + ] + })", + NYql::TDatabaseResolverResponse::TDatabaseDescription{ + TString{""}, + TString{"rc1b-eyt6dtobu96rwydq.db.yandex.net"}, + 3306, + TString(""), + true + }, + {}); + } + + Y_UNIT_TEST(MySQL_PermissionDenied) { + NYql::TIssues issues{ + NYql::TIssue( + TStringBuilder{} << MakeErrorPrefix( + "mdb.api.cloud.yandex.net:443", + "/managed-mysql/v1/clusters/etn021us5r9rhld1vgbh/hosts", + "etn021us5r9rhld1vgbh", + NYql::EDatabaseType::MySQL + ) << NoPermissionStr + ) + }; + + Test( + NYql::EDatabaseType::MySQL, + NYql::NConnector::NApi::EProtocol::NATIVE, + "https://mdb.api.cloud.yandex.net:443/managed-mysql/v1/clusters/etn021us5r9rhld1vgbh/hosts", + "403", + R"( + { + "code": 7, + "message": "Permission denied", + "details": [ + { + "@type": "type.googleapis.com/google.rpc.RequestInfo", + "requestId": "a943c092-d596-4e0e-ae7b-1f67f9d8164e" + } + ] + } + )", + NYql::TDatabaseResolverResponse::TDatabaseDescription{}, + issues + ); + } + Y_UNIT_TEST(DataStreams_PermissionDenied) { NYql::TIssues issues{ diff --git a/ydb/core/fq/libs/common/util.cpp b/ydb/core/fq/libs/common/util.cpp 
index 433709568eb1..61d2ea43bbda 100644 --- a/ydb/core/fq/libs/common/util.cpp +++ b/ydb/core/fq/libs/common/util.cpp @@ -129,6 +129,9 @@ TString ExtractServiceAccountId(const FederatedQuery::ConnectionSetting& setting case FederatedQuery::ConnectionSetting::kGreenplumCluster: { return GetServiceAccountId(setting.greenplum_cluster().auth()); } + case FederatedQuery::ConnectionSetting::kMysqlCluster: { + return GetServiceAccountId(setting.mysql_cluster().auth()); + } // Do not replace with default. Adding a new connection should cause a compilation error case FederatedQuery::ConnectionSetting::CONNECTION_NOT_SET: break; @@ -162,6 +165,8 @@ TMaybe GetLogin(const FederatedQuery::ConnectionSetting& setting) { return setting.postgresql_cluster().login(); case FederatedQuery::ConnectionSetting::kGreenplumCluster: return setting.greenplum_cluster().login(); + case FederatedQuery::ConnectionSetting::kMysqlCluster: + return setting.mysql_cluster().login(); } } @@ -183,6 +188,8 @@ TMaybe GetPassword(const FederatedQuery::ConnectionSetting& setting) { return setting.postgresql_cluster().password(); case FederatedQuery::ConnectionSetting::kGreenplumCluster: return setting.greenplum_cluster().password(); + case FederatedQuery::ConnectionSetting::kMysqlCluster: + return setting.mysql_cluster().password(); } } @@ -204,6 +211,8 @@ EYdbComputeAuth GetYdbComputeAuthMethod(const FederatedQuery::ConnectionSetting& return GetBasicAuthMethod(setting.postgresql_cluster().auth()); case FederatedQuery::ConnectionSetting::kGreenplumCluster: return GetBasicAuthMethod(setting.greenplum_cluster().auth()); + case FederatedQuery::ConnectionSetting::kMysqlCluster: + return GetBasicAuthMethod(setting.mysql_cluster().auth()); } } @@ -223,6 +232,8 @@ FederatedQuery::IamAuth GetAuth(const FederatedQuery::Connection& connection) { return connection.content().setting().postgresql_cluster().auth(); case FederatedQuery::ConnectionSetting::kGreenplumCluster: return 
connection.content().setting().greenplum_cluster().auth(); + case FederatedQuery::ConnectionSetting::kMysqlCluster: + return connection.content().setting().mysql_cluster().auth(); case FederatedQuery::ConnectionSetting::CONNECTION_NOT_SET: return FederatedQuery::IamAuth{}; } diff --git a/ydb/core/fq/libs/compute/common/config.h b/ydb/core/fq/libs/compute/common/config.h index 5817e2d94d6f..dec5b1a84f57 100644 --- a/ydb/core/fq/libs/compute/common/config.h +++ b/ydb/core/fq/libs/compute/common/config.h @@ -165,6 +165,7 @@ class TComputeConfig { case FederatedQuery::ConnectionSetting::kClickhouseCluster: case FederatedQuery::ConnectionSetting::kPostgresqlCluster: case FederatedQuery::ConnectionSetting::kGreenplumCluster: + case FederatedQuery::ConnectionSetting::kMysqlCluster: case FederatedQuery::ConnectionSetting::kYdbDatabase: return true; case FederatedQuery::ConnectionSetting::kDataStreams: diff --git a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp index 728bb2081131..1a781f374205 100644 --- a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp +++ b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp @@ -247,6 +247,19 @@ TString MakeCreateExternalDataSourceQuery( "use_tls"_a = common.GetDisableSslForGenericDataSources() ? "false" : "true", "schema"_a = gpschema ? ", SCHEMA=" + EncloseAndEscapeString(gpschema, '"') : TString{}); + } + case FederatedQuery::ConnectionSetting::kMysqlCluster: { + properties = fmt::format( + R"( + SOURCE_TYPE="MySQL", + MDB_CLUSTER_ID={mdb_cluster_id}, + DATABASE_NAME={database_name}, + USE_TLS="{use_tls}" + )", + "mdb_cluster_id"_a = EncloseAndEscapeString(connectionContent.setting().mysql_cluster().database_id(), '"'), + "database_name"_a = EncloseAndEscapeString(connectionContent.setting().mysql_cluster().database_name(), '"'), + "use_tls"_a = common.GetDisableSslForGenericDataSources() ? 
"false" : "true"); + } break; } diff --git a/ydb/core/fq/libs/control_plane_proxy/utils/utils.h b/ydb/core/fq/libs/control_plane_proxy/utils/utils.h index 1e6a531e2bc0..2eb9702387b8 100644 --- a/ydb/core/fq/libs/control_plane_proxy/utils/utils.h +++ b/ydb/core/fq/libs/control_plane_proxy/utils/utils.h @@ -34,6 +34,9 @@ TString ExtractServiceAccountIdWithConnection(const T& setting) { case FederatedQuery::ConnectionSetting::kGreenplumCluster: { return GetServiceAccountId(setting.greenplum_cluster().auth()); } + case FederatedQuery::ConnectionSetting::kMysqlCluster: { + return GetServiceAccountId(setting.mysql_cluster().auth()); + } // Do not replace with default. Adding a new connection should cause a compilation error case FederatedQuery::ConnectionSetting::CONNECTION_NOT_SET: break; diff --git a/ydb/core/fq/libs/control_plane_storage/request_validators.cpp b/ydb/core/fq/libs/control_plane_storage/request_validators.cpp index 03cf908e52de..996e9d7defa6 100644 --- a/ydb/core/fq/libs/control_plane_storage/request_validators.cpp +++ b/ydb/core/fq/libs/control_plane_storage/request_validators.cpp @@ -20,10 +20,15 @@ void ValidateGenericConnectionSetting( } if (!connection.database_id() && !(connection.host() && connection.port())) { - auto msg = TStringBuilder() << "content.setting.clickhouse_cluster.{database_id or host,port} field is not specified"; - issues.AddIssue( MakeErrorIssue(TIssuesIds::BAD_REQUEST,msg)); + auto msg = TStringBuilder() << "content.setting." << dataSourceKind << "_cluster.{database_id or host,port} field is not specified"; + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST,msg)); } + if (!connection.database_name()) { + auto msg = TStringBuilder() << "content.setting." << dataSourceKind << "_cluster.database_name field is not specified"; + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST,msg)); + } + if (!connection.login()) { auto msg = TStringBuilder() << "content.setting." 
<< dataSourceKind << "_cluster.login is not specified"; issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, msg)); @@ -70,17 +75,33 @@ NYql::TIssues ValidateConnectionSetting( break; } case FederatedQuery::ConnectionSetting::kGreenplumCluster: { - const FederatedQuery::GreenplumCluster database = setting.greenplum_cluster(); - if (!database.has_auth() || database.auth().identity_case() == FederatedQuery::IamAuth::IDENTITY_NOT_SET) { + const FederatedQuery::GreenplumCluster& greenplumCluster = setting.greenplum_cluster(); + + if (!greenplumCluster.has_auth() || greenplumCluster.auth().identity_case() == FederatedQuery::IamAuth::IDENTITY_NOT_SET) { issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "content.setting.greenplum_database.auth field is not specified")); } + if (greenplumCluster.auth().identity_case() == FederatedQuery::IamAuth::kCurrentIam && disableCurrentIam) { + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "current iam authorization is disabled")); + } + + if (!greenplumCluster.database_id() && !greenplumCluster.database_name()) { + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "content.setting.greenplum_database.{database_id or database_name} field is not specified")); + } + break; + } + case FederatedQuery::ConnectionSetting::kMysqlCluster: { + const FederatedQuery::MySQLCluster database = setting.mysql_cluster(); + if (!database.has_auth() || database.auth().identity_case() == FederatedQuery::IamAuth::IDENTITY_NOT_SET) { + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "content.setting.mysql_database.auth field is not specified")); + } + if (database.auth().identity_case() == FederatedQuery::IamAuth::kCurrentIam && disableCurrentIam) { issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "current iam authorization is disabled")); } if (!database.database_id() && !database.database_name()) { - issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "content.setting.greenplum_database.{database_id or 
database_name} field is not specified")); + issues.AddIssue(MakeErrorIssue(TIssuesIds::BAD_REQUEST, "content.setting.mysql_database.{database_id or database_name} field is not specified")); } break; } diff --git a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp index 62a2ad749ae1..ae371c52b811 100644 --- a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp +++ b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp @@ -41,6 +41,8 @@ FederatedQuery::IamAuth::IdentityCase GetIamAuth(const FederatedQuery::Connectio return setting.postgresql_cluster().auth().identity_case(); case FederatedQuery::ConnectionSetting::kGreenplumCluster: return setting.greenplum_cluster().auth().identity_case(); + case FederatedQuery::ConnectionSetting::kMysqlCluster: + return setting.mysql_cluster().auth().identity_case(); case FederatedQuery::ConnectionSetting::CONNECTION_NOT_SET: return FederatedQuery::IamAuth::IDENTITY_NOT_SET; } diff --git a/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp b/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp index ada5a7709fc7..634d835070df 100644 --- a/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp +++ b/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp @@ -18,6 +18,8 @@ namespace NFq { constexpr ui32 GREENPLUM_PORT = 6432; + constexpr ui32 MYSQL_PORT = 3306; + // TMdbEndpointGeneratorLegacy implements behavior required by YQL legacy ClickHouse provider class TMdbEndpointGeneratorLegacy: public NYql::IMdbEndpointGenerator { TEndpoint ToEndpoint(const NYql::IMdbEndpointGenerator::TParams& params) const override { @@ -76,13 +78,21 @@ namespace NFq { ythrow yexception() << "Unexpected protocol for PostgreSQL " << NYql::NConnector::NApi::EProtocol_Name(params.Protocol); } case NYql::EDatabaseType::Greenplum: - // 
https://cloud.yandex.ru/docs/managed-postgresql/operations/connect + // https://cloud.yandex.ru/docs/managed-greenplum/operations/connect switch (params.Protocol) { case NYql::NConnector::NApi::EProtocol::NATIVE: return TEndpoint(fixedHost, GREENPLUM_PORT); default: ythrow yexception() << "Unexpected protocol for Greenplum: " << NYql::NConnector::NApi::EProtocol_Name(params.Protocol); } + case NYql::EDatabaseType::MySQL: + // https://cloud.yandex.ru/docs/managed-mysql/operations/connect + switch (params.Protocol) { + case NYql::NConnector::NApi::EProtocol::NATIVE: + return TEndpoint(fixedHost, MYSQL_PORT); + default: + ythrow yexception() << "Unexpected protocol for MySQL: " << NYql::NConnector::NApi::EProtocol_Name(params.Protocol); + } default: ythrow yexception() << "Unexpected database type: " << ToString(params.DatabaseType); }; diff --git a/ydb/core/kqp/provider/yql_kikimr_gateway_ut.cpp b/ydb/core/kqp/provider/yql_kikimr_gateway_ut.cpp index b16d43a67527..f4ad83394ee2 100644 --- a/ydb/core/kqp/provider/yql_kikimr_gateway_ut.cpp +++ b/ydb/core/kqp/provider/yql_kikimr_gateway_ut.cpp @@ -422,6 +422,7 @@ Y_UNIT_TEST_SUITE(KikimrIcGateway) { LOCATION="my-bucket", AUTH_METHOD="BASIC", LOGIN="mylogin", + DATABASE_NAME="postgres", PASSWORD_SECRET_NAME=")" << secretId << R"(" );)"; auto result = session.ExecuteSchemeQuery(query).GetValueSync(); @@ -458,6 +459,7 @@ Y_UNIT_TEST_SUITE(KikimrIcGateway) { SERVICE_ACCOUNT_ID="mysa", SERVICE_ACCOUNT_SECRET_NAME=")" << secretSaId << R"(", LOGIN="mylogin", + DATABASE_NAME="postgres", PASSWORD_SECRET_NAME=")" << secretPasswordId << R"(" );)"; auto result = session.ExecuteSchemeQuery(query).GetValueSync(); diff --git a/ydb/core/tx/schemeshard/ut_external_data_source/ut_external_data_source.cpp b/ydb/core/tx/schemeshard/ut_external_data_source/ut_external_data_source.cpp index 9d89bbfdf199..92faf2827c6c 100644 --- a/ydb/core/tx/schemeshard/ut_external_data_source/ut_external_data_source.cpp +++ 
b/ydb/core/tx/schemeshard/ut_external_data_source/ut_external_data_source.cpp @@ -46,6 +46,10 @@ Y_UNIT_TEST_SUITE(TExternalDataSourceTest) { key: "mdb_cluster_id", value: "id" } + Properties { + key: "database_name", + value: "postgres" + } } )", {NKikimrScheme::StatusAccepted}); diff --git a/ydb/public/api/protos/draft/fq.proto b/ydb/public/api/protos/draft/fq.proto index 709e0a963a07..d1713334a047 100644 --- a/ydb/public/api/protos/draft/fq.proto +++ b/ydb/public/api/protos/draft/fq.proto @@ -501,6 +501,14 @@ message GreenplumCluster { IamAuth auth = 6; } +message MySQLCluster { + string database_id = 1 [(Ydb.length).le = 1024]; + string database_name = 2 [(Ydb.length).le = 1024]; + string login = 3 [(Ydb.length).le = 1024, (Ydb.sensitive) = true]; + string password = 4 [(Ydb.length).le = 1024, (Ydb.sensitive) = true]; + IamAuth auth = 5; +} + message ConnectionSetting { enum ConnectionType { CONNECTION_TYPE_UNSPECIFIED = 0; @@ -511,6 +519,7 @@ message ConnectionSetting { MONITORING = 5; POSTGRESQL_CLUSTER = 6; GREENPLUM_CLUSTER = 7; + MYSQL_CLUSTER = 8; } oneof connection { @@ -521,6 +530,7 @@ message ConnectionSetting { Monitoring monitoring = 5; PostgreSQLCluster postgresql_cluster = 6; GreenplumCluster greenplum_cluster = 7; + MySQLCluster mysql_cluster = 8; } } From 9577402d4da2e276d0b7c50a337b81740caba0de Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:29:43 +0300 Subject: [PATCH 02/56] YQ-3439 added retries for CURLE_COULDNT_RESOLVE_HOST (#6750) --- ydb/core/external_sources/object_storage.cpp | 3 +- ydb/core/kqp/host/kqp_host.cpp | 1 + .../yql_http_default_retry_policy.cpp | 3 +- .../s3/actors/yql_s3_raw_read_actor.cpp | 1 + .../providers/s3/actors/yql_s3_read_actor.cpp | 1 + .../providers/s3/actors/yql_s3_read_actor.h | 1 + .../s3/actors/yql_s3_source_queue.cpp | 6 +++ .../providers/s3/actors/yql_s3_source_queue.h | 1 + .../s3/object_listers/yql_s3_list.cpp | 13 ++++--- 
.../providers/s3/object_listers/yql_s3_list.h | 2 + .../s3/provider/yql_s3_dq_integration.cpp | 1 + .../s3/provider/yql_s3_io_discovery.cpp | 1 + .../s3/provider/yql_s3_listing_strategy.cpp | 39 ++++++++++++------- .../s3/provider/yql_s3_listing_strategy.h | 1 + .../providers/s3/provider/yql_s3_provider.h | 2 + 15 files changed, 55 insertions(+), 21 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index 5c679a07b9d8..36a87a69b8cd 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -304,7 +304,8 @@ struct TObjectStorageExternalSource : public IExternalSource { } auto httpGateway = NYql::IHTTPGateway::Make(); - auto s3Lister = NYql::NS3Lister::MakeS3Lister(httpGateway, NYql::NS3Lister::TListingRequest{ + auto httpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); + auto s3Lister = NYql::NS3Lister::MakeS3Lister(httpGateway, httpRetryPolicy, NYql::NS3Lister::TListingRequest{ .Url = meta->DataSourceLocation, .AuthInfo = authInfo, .Pattern = meta->TableLocation, diff --git a/ydb/core/kqp/host/kqp_host.cpp b/ydb/core/kqp/host/kqp_host.cpp index c54f1a26cc44..01c8b51843af 100644 --- a/ydb/core/kqp/host/kqp_host.cpp +++ b/ydb/core/kqp/host/kqp_host.cpp @@ -1695,6 +1695,7 @@ class TKqpHost : public IKqpHost { state->Configuration->AllowAtomicUploadCommit = queryType == EKikimrQueryType::Script; state->Configuration->Init(FederatedQuerySetup->S3GatewayConfig, TypesCtx); state->Gateway = FederatedQuerySetup->HttpGateway; + state->GatewayRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); state->ExecutorPoolId = AppData()->UserPoolId; auto dataSource = NYql::CreateS3DataSource(state); diff --git a/ydb/library/yql/providers/common/http_gateway/yql_http_default_retry_policy.cpp 
b/ydb/library/yql/providers/common/http_gateway/yql_http_default_retry_policy.cpp index 994ed88d5a76..6fd330caf4ca 100644 --- a/ydb/library/yql/providers/common/http_gateway/yql_http_default_retry_policy.cpp +++ b/ydb/library/yql/providers/common/http_gateway/yql_http_default_retry_policy.cpp @@ -29,7 +29,8 @@ std::unordered_set FqRetriedCurlCodes() { CURLE_SEND_ERROR, CURLE_RECV_ERROR, CURLE_NO_CONNECTION_AVAILABLE, - CURLE_GOT_NOTHING + CURLE_GOT_NOTHING, + CURLE_COULDNT_RESOLVE_HOST }; } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp index 2cc7214d54c9..749c86b9db44 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp @@ -111,6 +111,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID FileQueueBatchSizeLimit, FileQueueBatchObjectCountLimit, Gateway, + RetryPolicy, Url, AuthInfo, Pattern, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 9c827e32c1dd..358448c2d451 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -1319,6 +1319,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public FileQueueBatchSizeLimit, FileQueueBatchObjectCountLimit, Gateway, + RetryPolicy, Url, AuthInfo, Pattern, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h index 447b21c672d6..69de502f94e4 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h @@ -24,6 +24,7 @@ NActors::IActor* CreateS3FileQueueActor( ui64 batchSizeLimit, ui64 batchObjectCountLimit, IHTTPGateway::TPtr gateway, + IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, 
TS3Credentials::TAuthInfo authInfo, TString pattern, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp index 8a97cf82f6f2..eb6bebed5624 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp @@ -171,6 +171,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped ui64 batchSizeLimit, ui64 batchObjectCountLimit, IHTTPGateway::TPtr gateway, + IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, TS3Credentials::TAuthInfo authInfo, TString pattern, @@ -186,6 +187,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped , BatchSizeLimit(batchSizeLimit) , BatchObjectCountLimit(batchObjectCountLimit) , Gateway(std::move(gateway)) + , RetryPolicy(std::move(retryPolicy)) , Url(std::move(url)) , AuthInfo(std::move(authInfo)) , Pattern(std::move(pattern)) @@ -488,6 +490,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped CurrentDirectoryPathIndex = object.GetPathIndex(); MaybeLister = NS3Lister::MakeS3Lister( Gateway, + RetryPolicy, NS3Lister::TListingRequest{ Url, AuthInfo, @@ -611,6 +614,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped THashSet UpdatedConsumers; const IHTTPGateway::TPtr Gateway; + const IHTTPGateway::TRetryPolicy::TPtr RetryPolicy; const TString Url; const TS3Credentials::TAuthInfo AuthInfo; const TString Pattern; @@ -632,6 +636,7 @@ NActors::IActor* CreateS3FileQueueActor( ui64 batchSizeLimit, ui64 batchObjectCountLimit, IHTTPGateway::TPtr gateway, + IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, TS3Credentials::TAuthInfo authInfo, TString pattern, @@ -648,6 +653,7 @@ NActors::IActor* CreateS3FileQueueActor( batchSizeLimit, batchObjectCountLimit, gateway, + retryPolicy, url, authInfo, pattern, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h 
index e2b98a98429d..12a90ffa3faa 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h @@ -22,6 +22,7 @@ NActors::IActor* CreateS3FileQueueActor( ui64 batchSizeLimit, ui64 batchObjectCountLimit, IHTTPGateway::TPtr gateway, + IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, TS3Credentials::TAuthInfo authInfo, TString pattern, diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index f7c568acc869..15392deac1f1 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -243,6 +243,7 @@ class TS3Lister : public IS3Lister { TS3Lister( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TListingRequest& listingRequest, const TMaybe& delimiter, size_t maxFilesPerQuery, @@ -265,7 +266,7 @@ class TS3Lister : public IS3Lister { NewPromise>(), std::make_shared(), IHTTPGateway::TWeakPtr(httpGateway), - GetHTTPDefaultRetryPolicy(), + retryPolicy, CreateGuidAsString(), std::move(request), delimiter, @@ -391,7 +392,7 @@ class TS3Lister : public IS3Lister { NewPromise>(), std::make_shared(), ctx.GatewayWeak, - GetHTTPDefaultRetryPolicy(), + ctx.RetryPolicy, CreateGuidAsString(), ctx.ListingRequest, ctx.Delimiter, @@ -457,15 +458,16 @@ class TS3ParallelLimitedListerFactory : public IS3ListerFactory { TFuture Make( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const NS3Lister::TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles) override { auto acquired = Semaphore->AcquireAsync(); return acquired.Apply( - [ctx = SharedCtx, httpGateway, listingRequest, delimiter, allowLocalFiles](const auto& f) { + [ctx = SharedCtx, httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles](const auto& f) { return 
std::shared_ptr(new TListerLockReleaseWrapper{ NS3Lister::MakeS3Lister( - httpGateway, listingRequest, delimiter, allowLocalFiles, ctx), + httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles, ctx), std::make_unique( f.GetValue()->MakeAutoRelease())}); }); @@ -507,13 +509,14 @@ class TS3ParallelLimitedListerFactory : public IS3ListerFactory { IS3Lister::TPtr MakeS3Lister( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles, TSharedListingContextPtr sharedCtx) { if (listingRequest.Url.substr(0, 7) != "file://") { return std::make_shared( - httpGateway, listingRequest, delimiter, 1000, std::move(sharedCtx)); + httpGateway, retryPolicy, listingRequest, delimiter, 1000, std::move(sharedCtx)); } if (!allowLocalFiles) { diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h index bc6865ecee4d..24d70e4f9cae 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h @@ -165,6 +165,7 @@ class IS3Lister : public TIterator> { IS3Lister::TPtr MakeS3Lister( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles, @@ -176,6 +177,7 @@ class IS3ListerFactory { virtual NThreading::TFuture Make( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const NS3Lister::TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles) = 0; diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index 950dd6eac4f5..c9f3d40a42cb 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ 
b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -540,6 +540,7 @@ class TS3DqIntegration: public TDqIntegrationBase { fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, State_->Gateway, + State_->GatewayRetryPolicy, connect.Url, GetAuthInfo(State_->CredentialsFactory, State_->Configuration->Tokens.at(cluster)), pathPattern, diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index 6d3f3275ae28..155810c3485b 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -88,6 +88,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { State_->Configuration->RegexpCacheSize)) , ListingStrategy_(MakeS3ListingStrategy( State_->Gateway, + State_->GatewayRetryPolicy, ListerFactory_, State_->Configuration->MinDesiredDirectoriesOfFilesPerQuery, State_->Configuration->MaxInflightListsPerQuery, diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp index b6e1722dade9..843ba0bcf434 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp @@ -134,14 +134,15 @@ class TFlatFileS3ListingStrategy : public TCollectingS3ListingStrategy { TFlatFileS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, bool allowLocalFiles) : TCollectingS3ListingStrategy( - [allowLocalFiles, httpGateway, listerFactory]( + [allowLocalFiles, httpGateway, retryPolicy, listerFactory]( const TListingRequest& listingRequest, TS3ListingOptions options) { Y_UNUSED(options); return listerFactory->Make( - httpGateway, listingRequest, Nothing(), allowLocalFiles); + httpGateway, retryPolicy, listingRequest, Nothing(), 
allowLocalFiles); }, "TFlatFileS3ListingStrategy") { } }; @@ -151,14 +152,15 @@ class TDirectoryS3ListingStrategy : public TCollectingS3ListingStrategy { TDirectoryS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, bool allowLocalFiles) : TCollectingS3ListingStrategy( - [allowLocalFiles, httpGateway, listerFactory]( + [allowLocalFiles, httpGateway, retryPolicy, listerFactory]( const TListingRequest& listingRequest, TS3ListingOptions options) { Y_UNUSED(options); return listerFactory->Make( - httpGateway, listingRequest, "/", allowLocalFiles); + httpGateway, retryPolicy, listingRequest, "/", allowLocalFiles); }, "TDirectoryS3ListingStrategy") { } }; @@ -402,9 +404,10 @@ class TPartitionedDatasetS3ListingStrategy : public TCollectingS3ListingStrategy TPartitionedDatasetS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, bool allowLocalFiles) : TCollectingS3ListingStrategy( - [listerFactory, httpGateway, allowLocalFiles]( + [listerFactory, httpGateway, retryPolicy, allowLocalFiles]( const TListingRequest& listingRequest, TS3ListingOptions options) { auto ptr = std::shared_ptr( @@ -413,7 +416,7 @@ class TPartitionedDatasetS3ListingStrategy : public TCollectingS3ListingStrategy listingRequest.Prefix, options, TDirectoryS3ListingStrategy{ - listerFactory, httpGateway, allowLocalFiles}}); + listerFactory, httpGateway, retryPolicy, allowLocalFiles}}); return MakeFuture(std::move(ptr)); }, "TPartitionedDatasetS3ListingStrategy") { } @@ -557,10 +560,11 @@ class TUnPartitionedDatasetS3ListingStrategy : public TCollectingS3ListingStrate TUnPartitionedDatasetS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, size_t minParallelism, bool allowLocalFiles) : 
TCollectingS3ListingStrategy( - [listerFactory, httpGateway, minParallelism, allowLocalFiles]( + [listerFactory, httpGateway, retryPolicy, minParallelism, allowLocalFiles]( const TListingRequest& listingRequest, TS3ListingOptions options) { auto ptr = std::shared_ptr( @@ -579,7 +583,7 @@ class TUnPartitionedDatasetS3ListingStrategy : public TCollectingS3ListingStrate : listingRequest.Pattern.substr( 0, NS3::GetFirstWildcardPos(listingRequest.Pattern))}, TDirectoryS3ListingStrategy{ - listerFactory, httpGateway, allowLocalFiles}, + listerFactory, httpGateway, retryPolicy, allowLocalFiles}, minParallelism, options.MaxResultSet}); return MakeFuture(std::move(ptr)); @@ -893,11 +897,12 @@ class TConcurrentUnPartitionedDatasetS3ListingStrategy : TConcurrentUnPartitionedDatasetS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, size_t minParallelism, size_t maxParallelOps, bool allowLocalFiles) : TCollectingS3ListingStrategy( - [listerFactory, httpGateway, minParallelism, allowLocalFiles, maxParallelOps]( + [listerFactory, httpGateway, retryPolicy, minParallelism, allowLocalFiles, maxParallelOps]( const TListingRequest& listingRequest, TS3ListingOptions options) { auto ptr = std::shared_ptr( @@ -929,7 +934,7 @@ class TConcurrentUnPartitionedDatasetS3ListingStrategy : : listingRequest.Pattern.substr( 0, NS3::GetFirstWildcardPos(listingRequest.Pattern))}, TDirectoryS3ListingStrategy{ - listerFactory, httpGateway, allowLocalFiles}, + listerFactory, httpGateway, retryPolicy, allowLocalFiles}, options.MaxResultSet, maxParallelOps}); return MakeFuture(std::move(ptr)); @@ -943,10 +948,11 @@ class TConcurrentPartitionedDatasetS3ListingStrategy : TConcurrentPartitionedDatasetS3ListingStrategy( const IS3ListerFactory::TPtr& listerFactory, const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, size_t maxParallelOps, bool allowLocalFiles) : 
TCollectingS3ListingStrategy( - [listerFactory, httpGateway, allowLocalFiles, maxParallelOps]( + [listerFactory, httpGateway, retryPolicy, allowLocalFiles, maxParallelOps]( const TListingRequest& listingRequest, TS3ListingOptions options) { auto ptr = std::shared_ptr( @@ -974,12 +980,12 @@ class TConcurrentPartitionedDatasetS3ListingStrategy : : listingRequest.Pattern.substr( 0, NS3::GetFirstWildcardPos(listingRequest.Pattern))}, TDirectoryS3ListingStrategy{ - listerFactory, httpGateway, allowLocalFiles}, + listerFactory, httpGateway, retryPolicy, allowLocalFiles}, options.MaxResultSet, maxParallelOps}); return MakeFuture(std::move(ptr)); }, - "TConcurrentUnPartitionedDatasetS3ListingStrategy") { } + "TConcurrentPartitionedDatasetS3ListingStrategy") { } }; @@ -1024,6 +1030,7 @@ class TLoggingS3ListingStrategy : public IS3ListingStrategy { IS3ListingStrategy::TPtr MakeS3ListingStrategy( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const IS3ListerFactory::TPtr& listerFactory, ui64 minDesiredDirectoriesOfFilesPerQuery, size_t maxParallelOps, @@ -1032,7 +1039,7 @@ IS3ListingStrategy::TPtr MakeS3ListingStrategy( std::make_shared( std::vector>{ std::make_shared( - listerFactory, httpGateway, allowLocalFiles), + listerFactory, httpGateway, retryPolicy, allowLocalFiles), std::make_shared( std::initializer_list{ {[](const TS3ListingOptions& options) { @@ -1042,6 +1049,7 @@ IS3ListingStrategy::TPtr MakeS3ListingStrategy( std::make_shared( listerFactory, httpGateway, + retryPolicy, allowLocalFiles)}, {[](const TS3ListingOptions& options) { return options.IsPartitionedDataset && @@ -1050,6 +1058,7 @@ IS3ListingStrategy::TPtr MakeS3ListingStrategy( std::make_shared( listerFactory, httpGateway, + retryPolicy, maxParallelOps, allowLocalFiles)}, {[](const TS3ListingOptions& options) { @@ -1059,6 +1068,7 @@ IS3ListingStrategy::TPtr MakeS3ListingStrategy( std::make_shared( listerFactory, httpGateway, + retryPolicy, 
minDesiredDirectoriesOfFilesPerQuery, allowLocalFiles)}, {[](const TS3ListingOptions& options) { @@ -1068,6 +1078,7 @@ IS3ListingStrategy::TPtr MakeS3ListingStrategy( std::make_shared( listerFactory, httpGateway, + retryPolicy, minDesiredDirectoriesOfFilesPerQuery, maxParallelOps, allowLocalFiles)}})})); diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.h b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.h index 1c1a8d8acaca..611e2dce1368 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.h +++ b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.h @@ -31,6 +31,7 @@ class IS3ListingStrategy { IS3ListingStrategy::TPtr MakeS3ListingStrategy( const IHTTPGateway::TPtr& httpGateway, + const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const NS3Lister::IS3ListerFactory::TPtr& listerFactory, ui64 minDesiredDirectoriesOfFilesPerQuery, size_t maxParallelOps, diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_provider.h b/ydb/library/yql/providers/s3/provider/yql_s3_provider.h index 49c805c707f3..0bcf96290c7a 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_provider.h +++ b/ydb/library/yql/providers/s3/provider/yql_s3_provider.h @@ -2,6 +2,7 @@ #include #include +#include #include #include "yql_s3_settings.h" @@ -28,6 +29,7 @@ struct TS3State : public TThrRefBase const NKikimr::NMiniKQL::IFunctionRegistry* FunctionRegistry = nullptr; ISecuredServiceAccountCredentialsFactory::TPtr CredentialsFactory; IHTTPGateway::TPtr Gateway; + IHTTPGateway::TRetryPolicy::TPtr GatewayRetryPolicy = GetHTTPDefaultRetryPolicy(); ui32 ExecutorPoolId = 0; std::list> PrimaryKeys; }; From 0af35b6e6874f2f3f03709b315b7940af2ad1bf4 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:44:56 +0300 Subject: [PATCH 03/56] YQ-3152 fix error failed to execute callable ResWrite! 
(#6940) --- ydb/core/external_sources/object_storage.cpp | 21 ++++++++++------- .../inference/ut/arrow_inference_ut.cpp | 2 +- .../object_storage/s3_fetcher.cpp | 17 +++++++------- .../object_storage/s3_fetcher.h | 2 +- .../s3/actors/yql_s3_raw_read_actor.cpp | 21 +++++++++-------- .../s3/actors/yql_s3_raw_read_actor.h | 2 +- .../providers/s3/actors/yql_s3_read_actor.cpp | 17 +++++++------- .../providers/s3/actors/yql_s3_read_actor.h | 2 +- .../s3/actors/yql_s3_source_queue.cpp | 12 +++++----- .../providers/s3/actors/yql_s3_source_queue.h | 2 +- .../providers/s3/credentials/credentials.cpp | 14 ++++++++++- .../providers/s3/credentials/credentials.h | 5 ++++ .../s3/object_listers/yql_s3_list.cpp | 7 +++--- .../providers/s3/object_listers/yql_s3_list.h | 2 +- .../s3/provider/yql_s3_dq_integration.cpp | 2 +- .../s3/provider/yql_s3_io_discovery.cpp | 23 ++++++++++++------- 16 files changed, 91 insertions(+), 60 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index 36a87a69b8cd..37e5eaa0fa1b 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -10,9 +10,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -284,12 +286,13 @@ struct TObjectStorageExternalSource : public IExternalSource { return NThreading::MakeFuture(std::move(meta)); } - NYql::TS3Credentials::TAuthInfo authInfo{}; + NYql::TStructuredTokenBuilder structuredTokenBuilder; if (std::holds_alternative(meta->Auth)) { auto& awsAuth = std::get(meta->Auth); - authInfo.AwsAccessKey = awsAuth.AccessKey; - authInfo.AwsAccessSecret = awsAuth.SecretAccessKey; - authInfo.AwsRegion = awsAuth.Region; + NYql::NS3::TAwsParams params; + params.SetAwsAccessKey(awsAuth.AccessKey); + params.SetAwsRegion(awsAuth.Region); + structuredTokenBuilder.SetBasicAuth(params.SerializeAsString(), awsAuth.SecretAccessKey); } else if 
(std::holds_alternative(meta->Auth)) { if (!CredentialsFactory) { try { @@ -299,15 +302,17 @@ struct TObjectStorageExternalSource : public IExternalSource { } } auto& saAuth = std::get(meta->Auth); - NYql::GetAuthInfo(CredentialsFactory, ""); - authInfo.Token = CredentialsFactory->Create(saAuth.ServiceAccountId, saAuth.ServiceAccountIdSignature)->CreateProvider()->GetAuthInfo(); + structuredTokenBuilder.SetServiceAccountIdAuth(saAuth.ServiceAccountId, saAuth.ServiceAccountIdSignature); + } else { + structuredTokenBuilder.SetNoAuth(); } + const NYql::TS3Credentials credentials(CredentialsFactory, structuredTokenBuilder.ToJson()); auto httpGateway = NYql::IHTTPGateway::Make(); auto httpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); auto s3Lister = NYql::NS3Lister::MakeS3Lister(httpGateway, httpRetryPolicy, NYql::NS3Lister::TListingRequest{ .Url = meta->DataSourceLocation, - .AuthInfo = authInfo, + .Credentials = credentials, .Pattern = meta->TableLocation, }, Nothing(), false); auto afterListing = s3Lister->Next().Apply([path = meta->TableLocation](const NThreading::TFuture& listResFut) { @@ -332,7 +337,7 @@ struct TObjectStorageExternalSource : public IExternalSource { meta->DataSourceLocation, httpGateway, NYql::IHTTPGateway::TRetryPolicy::GetNoRetryPolicy(), - std::move(authInfo) + credentials )); meta->Attributes.erase("withinfer"); diff --git a/ydb/core/external_sources/object_storage/inference/ut/arrow_inference_ut.cpp b/ydb/core/external_sources/object_storage/inference/ut/arrow_inference_ut.cpp index 8edd7a424212..88a46386035f 100644 --- a/ydb/core/external_sources/object_storage/inference/ut/arrow_inference_ut.cpp +++ b/ydb/core/external_sources/object_storage/inference/ut/arrow_inference_ut.cpp @@ -45,7 +45,7 @@ class ArrowInferenceTest : public testing::Test { BaseUrl, Gateway, NYql::IHTTPGateway::TRetryPolicy::GetNoRetryPolicy(), - NYql::TS3Credentials::TAuthInfo{}), 1); + 
NYql::TS3Credentials{}), 1); } NActors::TActorId RegisterInferencinator(TStringBuf formatStr) { diff --git a/ydb/core/external_sources/object_storage/s3_fetcher.cpp b/ydb/core/external_sources/object_storage/s3_fetcher.cpp index 1238147ee089..c9dc7ca45e32 100644 --- a/ydb/core/external_sources/object_storage/s3_fetcher.cpp +++ b/ydb/core/external_sources/object_storage/s3_fetcher.cpp @@ -10,11 +10,11 @@ class S3Fetcher : public NActors::TActorBootstrapped { TString url, NYql::IHTTPGateway::TPtr gateway, NYql::IHTTPGateway::TRetryPolicy::TPtr retryPolicy, - NYql::TS3Credentials::TAuthInfo authInfo) + const NYql::TS3Credentials& credentials) : Url_{std::move(url)} , Gateway_{std::move(gateway)} , RetryPolicy_{std::move(retryPolicy)} - , AuthInfo_{std::move(authInfo)} + , Credentials_(credentials) {} void Bootstrap() { @@ -60,12 +60,13 @@ class S3Fetcher : public NActors::TActorBootstrapped { void StartDownload(std::shared_ptr&& request, NActors::TActorSystem* actorSystem) { auto length = request->End - request->Start; + const auto& authInfo = Credentials_.GetAuthInfo(); auto headers = NYql::IHTTPGateway::MakeYcHeaders( request->RequestId.AsGuidString(), - AuthInfo_.GetToken(), + authInfo.GetToken(), {}, - AuthInfo_.GetAwsUserPwd(), - AuthInfo_.GetAwsSigV4() + authInfo.GetAwsUserPwd(), + authInfo.GetAwsSigV4() ); Gateway_->Download( @@ -79,15 +80,15 @@ class S3Fetcher : public NActors::TActorBootstrapped { TString Url_; NYql::IHTTPGateway::TPtr Gateway_; NYql::IHTTPGateway::TRetryPolicy::TPtr RetryPolicy_; - NYql::TS3Credentials::TAuthInfo AuthInfo_; + const NYql::TS3Credentials Credentials_; }; NActors::IActor* CreateS3FetcherActor( TString url, NYql::IHTTPGateway::TPtr gateway, NYql::IHTTPGateway::TRetryPolicy::TPtr retryPolicy, - NYql::TS3Credentials::TAuthInfo authInfo) { + const NYql::TS3Credentials& credentials) { - return new S3Fetcher(std::move(url), std::move(gateway), std::move(retryPolicy), std::move(authInfo)); + return new S3Fetcher(std::move(url), 
std::move(gateway), std::move(retryPolicy), credentials); } } // namespace NKikimr::NExternalSource::NObjectStorage diff --git a/ydb/core/external_sources/object_storage/s3_fetcher.h b/ydb/core/external_sources/object_storage/s3_fetcher.h index 51310ec3ca64..51b77210f5b2 100644 --- a/ydb/core/external_sources/object_storage/s3_fetcher.h +++ b/ydb/core/external_sources/object_storage/s3_fetcher.h @@ -13,5 +13,5 @@ NActors::IActor* CreateS3FetcherActor( TString url, NYql::IHTTPGateway::TPtr gateway, NYql::IHTTPGateway::TRetryPolicy::TPtr retryPolicy, - NYql::TS3Credentials::TAuthInfo authInfo); + const NYql::TS3Credentials& credentials); } // namespace NKikimr::NExternalSource::NObjectStorage diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp index 749c86b9db44..d5bfdd479e2f 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp @@ -42,7 +42,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID IHTTPGateway::TPtr gateway, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const TString& url, - const TS3Credentials::TAuthInfo& authInfo, + const TS3Credentials& credentials, const TString& pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, NYql::NS3Details::TPathList&& paths, @@ -69,7 +69,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID , RetryPolicy(retryPolicy) , ActorSystem(NActors::TActivationContext::ActorSystem()) , Url(url) - , AuthInfo(authInfo) + , Credentials(credentials) , Pattern(pattern) , PatternVariant(patternVariant) , Paths(std::move(paths)) @@ -113,7 +113,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID Gateway, RetryPolicy, Url, - AuthInfo, + Credentials, Pattern, PatternVariant, NYql::NS3Lister::ES3PatternType::Wildcard)); @@ -164,10 +164,11 @@ class TS3ReadActor : public NActors::TActorBootstrapped, 
public ID auto url = Url + object.GetPath(); auto id = object.GetPathIndex(); const TString requestId = CreateGuidAsString(); + const auto& authInfo = Credentials.GetAuthInfo(); LOG_D("TS3ReadActor", "Download: " << url << ", ID: " << id << ", request id: [" << requestId << "]"); Gateway->Download( UrlEscapeRet(url, true), - IHTTPGateway::MakeYcHeaders(requestId, AuthInfo.GetToken(), {}, AuthInfo.GetAwsUserPwd(), AuthInfo.GetAwsSigV4()), + IHTTPGateway::MakeYcHeaders(requestId, authInfo.GetToken(), {}, authInfo.GetAwsUserPwd(), authInfo.GetAwsSigV4()), 0U, std::min(object.GetSize(), SizeLimit), std::bind(&TS3ReadActor::OnDownloadFinished, ActorSystem, SelfId(), requestId, std::placeholders::_1, id, object.GetPath()), @@ -456,7 +457,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID NActors::TActorSystem* const ActorSystem; const TString Url; - const TS3Credentials::TAuthInfo AuthInfo; + const TS3Credentials Credentials; const TString Pattern; const NYql::NS3Lister::ES3PatternVariant PatternVariant; NYql::NS3Details::TPathList Paths; @@ -503,7 +504,7 @@ std::pair CreateRawRead IHTTPGateway::TPtr gateway, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const TString& url, - const TS3Credentials::TAuthInfo& authInfo, + const TS3Credentials& credentials, const TString& pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, NYql::NS3Details::TPathList&& paths, @@ -527,14 +528,14 @@ std::pair CreateRawRead statsLevel, txId, std::move(gateway), - holderFactory, - url, - authInfo, + holderFactory, + url, + credentials, pattern, patternVariant, std::move(paths), addPathIndex, - computeActorId, + computeActorId, sizeLimit, retryPolicy, readActorFactoryCfg, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h index 3c76d44bf7d8..102ea19c94c3 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h +++ 
b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h @@ -19,7 +19,7 @@ std::pair CreateRawRead IHTTPGateway::TPtr gateway, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const TString& url, - const TS3Credentials::TAuthInfo& authInfo, + const TS3Credentials& credentials, const TString& pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, NYql::NS3Details::TPathList&& paths, diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 358448c2d451..81ec94b2d51a 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -1201,7 +1201,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public IHTTPGateway::TPtr gateway, const THolderFactory& holderFactory, const TString& url, - const TS3Credentials::TAuthInfo& authInfo, + const TS3Credentials& credentials, const TString& pattern, ES3PatternVariant patternVariant, TPathList&& paths, @@ -1230,7 +1230,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public , ComputeActorId(computeActorId) , RetryPolicy(retryPolicy) , Url(url) - , AuthInfo(authInfo) + , Credentials(credentials) , Pattern(pattern) , PatternVariant(patternVariant) , Paths(std::move(paths)) @@ -1321,7 +1321,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public Gateway, RetryPolicy, Url, - AuthInfo, + Credentials, Pattern, PatternVariant, ES3PatternType::Wildcard)); @@ -1385,10 +1385,11 @@ class TS3StreamReadActor : public TActorBootstrapped, public << pathIndex); TActorId actorId; + const auto& authInfo = Credentials.GetAuthInfo(); auto stuff = std::make_shared( Gateway, Url + object.GetPath(), - IHTTPGateway::MakeYcHeaders(requestId, AuthInfo.GetToken(), {}, AuthInfo.GetAwsUserPwd(), AuthInfo.GetAwsSigV4()), + IHTTPGateway::MakeYcHeaders(requestId, authInfo.GetToken(), {}, authInfo.GetAwsUserPwd(), authInfo.GetAwsSigV4()), object.GetSize(), TxId, 
requestId, @@ -1786,7 +1787,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public const IHTTPGateway::TRetryPolicy::TPtr RetryPolicy; const TString Url; - const TS3Credentials::TAuthInfo AuthInfo; + const TS3Credentials Credentials; const TString Pattern; const ES3PatternVariant PatternVariant; TPathList Paths; @@ -2000,7 +2001,7 @@ std::pair CreateS3ReadActor( ReadPathsList(params, taskParams, readRanges, paths); const auto token = secureParams.Value(params.GetToken(), TString{}); - const auto authInfo = GetAuthInfo(credentialsFactory, token); + const TS3Credentials credentials(credentialsFactory, token); const auto& settings = params.GetSettings(); TString pathPattern = "*"; @@ -2178,7 +2179,7 @@ std::pair CreateS3ReadActor( sizeLimit = FromString(it->second); } - const auto actor = new TS3StreamReadActor(inputIndex, statsLevel, txId, std::move(gateway), holderFactory, params.GetUrl(), authInfo, pathPattern, pathPatternVariant, + const auto actor = new TS3StreamReadActor(inputIndex, statsLevel, txId, std::move(gateway), holderFactory, params.GetUrl(), credentials, pathPattern, pathPatternVariant, std::move(paths), addPathIndex, readSpec, computeActorId, retryPolicy, cfg, counters, taskCounters, fileSizeLimit, sizeLimit, rowsLimitHint, memoryQuotaManager, params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta, @@ -2190,7 +2191,7 @@ std::pair CreateS3ReadActor( if (const auto it = settings.find("sizeLimit"); settings.cend() != it) sizeLimit = FromString(it->second); - return CreateRawReadActor(inputIndex, statsLevel, txId, std::move(gateway), holderFactory, params.GetUrl(), authInfo, pathPattern, pathPatternVariant, + return CreateRawReadActor(inputIndex, statsLevel, txId, std::move(gateway), holderFactory, params.GetUrl(), credentials, pathPattern, pathPatternVariant, std::move(paths), addPathIndex, computeActorId, sizeLimit, retryPolicy, cfg, counters, taskCounters, 
fileSizeLimit, rowsLimitHint, params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta); diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h index 69de502f94e4..5de66acf6f1f 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h @@ -26,7 +26,7 @@ NActors::IActor* CreateS3FileQueueActor( IHTTPGateway::TPtr gateway, IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, - TS3Credentials::TAuthInfo authInfo, + const TS3Credentials& credentials, TString pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, NS3Lister::ES3PatternType patternType); diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp index eb6bebed5624..060afbb4aea5 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp @@ -173,7 +173,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped IHTTPGateway::TPtr gateway, IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, - TS3Credentials::TAuthInfo authInfo, + const TS3Credentials& credentials, TString pattern, NS3Lister::ES3PatternVariant patternVariant, NS3Lister::ES3PatternType patternType) @@ -189,7 +189,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped , Gateway(std::move(gateway)) , RetryPolicy(std::move(retryPolicy)) , Url(std::move(url)) - , AuthInfo(std::move(authInfo)) + , Credentials(credentials) , Pattern(std::move(pattern)) , PatternVariant(patternVariant) , PatternType(patternType) { @@ -493,7 +493,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped RetryPolicy, NS3Lister::TListingRequest{ Url, - AuthInfo, + Credentials, PatternVariant == NS3Lister::ES3PatternVariant::PathPattern ? 
Pattern : TStringBuilder{} << object.GetPath() << Pattern, @@ -616,7 +616,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped const IHTTPGateway::TPtr Gateway; const IHTTPGateway::TRetryPolicy::TPtr RetryPolicy; const TString Url; - const TS3Credentials::TAuthInfo AuthInfo; + const TS3Credentials Credentials; const TString Pattern; const NS3Lister::ES3PatternVariant PatternVariant; const NS3Lister::ES3PatternType PatternType; @@ -638,7 +638,7 @@ NActors::IActor* CreateS3FileQueueActor( IHTTPGateway::TPtr gateway, IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, - TS3Credentials::TAuthInfo authInfo, + const TS3Credentials& credentials, TString pattern, NS3Lister::ES3PatternVariant patternVariant, NS3Lister::ES3PatternType patternType) { @@ -655,7 +655,7 @@ NActors::IActor* CreateS3FileQueueActor( gateway, retryPolicy, url, - authInfo, + credentials, pattern, patternVariant, patternType diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h index 12a90ffa3faa..86fd9aa1d385 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h @@ -24,7 +24,7 @@ NActors::IActor* CreateS3FileQueueActor( IHTTPGateway::TPtr gateway, IHTTPGateway::TRetryPolicy::TPtr retryPolicy, TString url, - TS3Credentials::TAuthInfo authInfo, + const TS3Credentials& credentials, TString pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, NS3Lister::ES3PatternType patternType); diff --git a/ydb/library/yql/providers/s3/credentials/credentials.cpp b/ydb/library/yql/providers/s3/credentials/credentials.cpp index eedd583778de..5c0ecf85b1d4 100644 --- a/ydb/library/yql/providers/s3/credentials/credentials.cpp +++ b/ydb/library/yql/providers/s3/credentials/credentials.cpp @@ -6,6 +6,7 @@ namespace NYql { TS3Credentials::TS3Credentials(ISecuredServiceAccountCredentialsFactory::TPtr factory, const TString& 
structuredTokenJson, bool addBearerToToken) + : StructuredTokenJson(structuredTokenJson) { if (NYql::IsStructuredTokenJson(structuredTokenJson)) { NYql::TStructuredTokenParser parser = NYql::CreateStructuredTokenParser(structuredTokenJson); @@ -24,7 +25,7 @@ TS3Credentials::TS3Credentials(ISecuredServiceAccountCredentialsFactory::TPtr fa } auto providerFactory = CreateCredentialsProviderFactoryForStructuredToken(factory, structuredTokenJson, addBearerToToken); - CredentialsProvider = providerFactory->CreateProvider(); + CredentialsProvider = providerFactory->CreateProvider(); // Heavy operation, BLOCKs thread until TA reply } TS3Credentials::TAuthInfo TS3Credentials::GetAuthInfo() const { @@ -34,6 +35,17 @@ TS3Credentials::TAuthInfo TS3Credentials::GetAuthInfo() const { return AuthInfo; } +bool TS3Credentials::operator<(const TS3Credentials& other) const { + return StructuredTokenJson < other.StructuredTokenJson; +} + +IOutputStream& operator<<(IOutputStream& stream, const TS3Credentials& credentials) { + const auto& authInfo = credentials.AuthInfo; + return stream << "TS3Credentials{.ServiceAccountAuth=" << static_cast(credentials.CredentialsProvider) + << ",.AwsUserPwd=" + << ",.AwsSigV4=}"; +} + // string value after AWS prefix should be suitable for passing it to curl as CURLOPT_USERPWD, see details here: // https://curl.se/libcurl/c/CURLOPT_AWS_SIGV4.html // CURLOPT_USERPWD = "MY_ACCESS_KEY:MY_SECRET_KEY" diff --git a/ydb/library/yql/providers/s3/credentials/credentials.h b/ydb/library/yql/providers/s3/credentials/credentials.h index 3d9b41ea75a4..4c299c9d015f 100644 --- a/ydb/library/yql/providers/s3/credentials/credentials.h +++ b/ydb/library/yql/providers/s3/credentials/credentials.h @@ -19,11 +19,16 @@ struct TS3Credentials { TString AwsRegion; }; + TS3Credentials() = default; TS3Credentials(ISecuredServiceAccountCredentialsFactory::TPtr factory, const TString& structuredTokenJson, bool addBearerToToken = false); TAuthInfo GetAuthInfo() const; + bool 
operator<(const TS3Credentials& other) const; + friend IOutputStream& operator<<(IOutputStream& stream, const TS3Credentials& credentials); + private: + TString StructuredTokenJson; NYdb::TCredentialsProviderPtr CredentialsProvider; TS3Credentials::TAuthInfo AuthInfo; }; diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index 15392deac1f1..32f2df4629b0 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -29,9 +29,7 @@ IOutputStream& operator<<(IOutputStream& stream, const TListingRequest& request) << ",.Prefix=" << request.Prefix << ",.Pattern=" << request.Pattern << ",.PatternType=" << request.PatternType - << ",.AwsUserPwd=" - << ",.AwsSigV4=" << request.AuthInfo.GetAwsSigV4().length() - << ",.Token=}"; + << ",.Credentials=" << request.Credentials << "}"; } namespace { @@ -287,7 +285,8 @@ class TS3Lister : public IS3Lister { ~TS3Lister() override = default; private: static void SubmitRequestIntoGateway(TListingContext& ctx) { - IHTTPGateway::THeaders headers = IHTTPGateway::MakeYcHeaders(ctx.RequestId, ctx.ListingRequest.AuthInfo.GetToken(), {}, ctx.ListingRequest.AuthInfo.GetAwsUserPwd(), ctx.ListingRequest.AuthInfo.GetAwsSigV4()); + const auto& authInfo = ctx.ListingRequest.Credentials.GetAuthInfo(); + IHTTPGateway::THeaders headers = IHTTPGateway::MakeYcHeaders(ctx.RequestId, authInfo.GetToken(), {}, authInfo.GetAwsUserPwd(), authInfo.GetAwsSigV4()); // We have to sort the cgi parameters for the correct aws signature // This requirement will be fixed in the curl library diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h index 24d70e4f9cae..93fafae19057 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h @@ -150,7 +150,7 @@ using 
TListResult = std::variant; struct TListingRequest { TString Url; - TS3Credentials::TAuthInfo AuthInfo; + TS3Credentials Credentials; TString Pattern; ES3PatternType PatternType = ES3PatternType::Wildcard; TString Prefix; diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index c9f3d40a42cb..b90ada844c3b 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -542,7 +542,7 @@ class TS3DqIntegration: public TDqIntegrationBase { State_->Gateway, State_->GatewayRetryPolicy, connect.Url, - GetAuthInfo(State_->CredentialsFactory, State_->Configuration->Tokens.at(cluster)), + TS3Credentials(State_->CredentialsFactory, State_->Configuration->Tokens.at(cluster)), pathPattern, pathPatternVariant, NS3Lister::ES3PatternType::Wildcard diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index 155810c3485b..2c1ee3313622 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -63,10 +63,8 @@ struct TListRequest { }; bool operator<(const TListRequest& a, const TListRequest& b) { - const auto& lhs = a.S3Request.AuthInfo; - const auto& rhs = b.S3Request.AuthInfo; - return std::tie(lhs.Token, lhs.AwsAccessKey, lhs.AwsAccessSecret, lhs.AwsRegion, a.S3Request.Url, a.S3Request.Pattern) < - std::tie(rhs.Token, rhs.AwsAccessKey, rhs.AwsAccessSecret, rhs.AwsRegion, b.S3Request.Url, b.S3Request.Pattern); + return std::tie(a.S3Request.Credentials, a.S3Request.Url, a.S3Request.Pattern) < + std::tie(b.S3Request.Credentials, b.S3Request.Url, b.S3Request.Pattern); } using TPendingRequests = TMap>; @@ -588,7 +586,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { const auto& connect = 
State_->Configuration->Clusters.at(dataSource.Cluster().StringValue()); const auto& token = State_->Configuration->Tokens.at(dataSource.Cluster().StringValue()); - const auto authInfo = GetAuthInfo(State_->CredentialsFactory, token); + const auto& credentials = GetOrCreateCredentials(token); const TString url = connect.Url; auto s3ParseSettings = source.Input().Maybe().Cast(); TString filePattern; @@ -620,7 +618,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { auto req = TListRequest{.S3Request{ .Url = url, - .AuthInfo = authInfo, + .Credentials = credentials, .Pattern = NS3::NormalizePath( TStringBuilder() << dir.Path << "/" << effectiveFilePattern), .PatternType = NS3Lister::ES3PatternType::Wildcard, @@ -744,7 +742,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { const auto& connect = State_->Configuration->Clusters.at(read.DataSource().Cluster().StringValue()); const auto& token = State_->Configuration->Tokens.at(read.DataSource().Cluster().StringValue()); - const auto authInfo = GetAuthInfo(State_->CredentialsFactory, token); + const auto& credentials = GetOrCreateCredentials(token); const TString url = connect.Url; TGeneratedColumnsConfig config; @@ -772,7 +770,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { State_->Configuration->UseConcurrentDirectoryLister.Get().GetOrElse( State_->Configuration->AllowConcurrentListings); auto req = TListRequest{ - .S3Request{.Url = url, .AuthInfo = authInfo}, + .S3Request{.Url = url, .Credentials = credentials}, .FilePattern = effectiveFilePattern, .Options{ .IsConcurrentListing = isConcurrentListingEnabled, @@ -880,6 +878,14 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { return true; } + TS3Credentials GetOrCreateCredentials(const TString& token) { + auto it = S3Credentials_.find(token); + if (it != S3Credentials_.end()) { + return it->second; + } + return S3Credentials_.insert({token, TS3Credentials(State_->CredentialsFactory, 
token)}).first->second; + } + const TS3State::TPtr State_; const NS3Lister::IS3ListerFactory::TPtr ListerFactory_; const IS3ListingStrategy::TPtr ListingStrategy_; @@ -887,6 +893,7 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { TPendingRequests PendingRequests_; TNodeMap> RequestsByNode_; TNodeMap GenColumnsByNode_; + std::unordered_map S3Credentials_; NThreading::TFuture AllFuture_; }; From 9210980c0ff6fa739157d360a5e2f44d42772ef7 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Tue, 23 Jul 2024 08:30:03 +0200 Subject: [PATCH 04/56] AsyncDecompressing + lz4 fix (#6889) --- .../hive_metastore/ut/common.cpp | 2 +- .../s3/ut/s3_aws_credentials_ut.cpp | 10 +- ydb/library/yql/providers/s3/actors/ya.make | 1 + .../s3/actors/yql_s3_decompressor_actor.cpp | 127 ++++++++++++++++++ .../s3/actors/yql_s3_decompressor_actor.h | 9 ++ .../providers/s3/actors/yql_s3_read_actor.cpp | 93 +++++++++++-- .../yql/providers/s3/compressors/lz4io.cpp | 46 ++++--- .../s3/compressors/ut/decompressor_ut.cpp | 45 +++++++ .../ut/test_compression_data/test.broken.lz4 | Bin 0 -> 79 bytes .../ut/test_compression_data/test.json | 8 ++ .../ut/test_compression_data/test.json.lz4 | Bin 0 -> 79 bytes .../yql/providers/s3/compressors/ut/ya.make | 24 ++++ .../yql/providers/s3/compressors/ya.make | 3 + ydb/library/yql/providers/s3/events/events.h | 20 +++ .../yql/providers/s3/proto/source.proto | 1 + .../s3/provider/yql_s3_dq_integration.cpp | 1 + .../providers/s3/provider/yql_s3_settings.cpp | 1 + .../providers/s3/provider/yql_s3_settings.h | 1 + 18 files changed, 356 insertions(+), 36 deletions(-) create mode 100644 ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp create mode 100644 ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.h create mode 100644 ydb/library/yql/providers/s3/compressors/ut/decompressor_ut.cpp create mode 100644 ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.broken.lz4 create mode 100644 
ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json create mode 100644 ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json.lz4 create mode 100644 ydb/library/yql/providers/s3/compressors/ut/ya.make diff --git a/ydb/core/external_sources/hive_metastore/ut/common.cpp b/ydb/core/external_sources/hive_metastore/ut/common.cpp index cb99adf37a1d..94433fce74b5 100644 --- a/ydb/core/external_sources/hive_metastore/ut/common.cpp +++ b/ydb/core/external_sources/hive_metastore/ut/common.cpp @@ -25,7 +25,7 @@ TString Exec(const TString& cmd) { TString GetExternalPort(const TString& service, const TString& port) { auto dockerComposeBin = BinaryPath("library/recipes/docker_compose/bin/docker-compose"); - auto composeFileYml = ArcadiaSourceRoot() + "/ydb/core/external_sources/hive_metastore/ut/docker-compose.yml"; + auto composeFileYml = ArcadiaFromCurrentLocation(__SOURCE_FILE__, "docker-compose.yml"); auto result = StringSplitter(Exec(dockerComposeBin + " -f " + composeFileYml + " port " + service + " " + port)).Split(':').ToList(); return result ? 
Strip(result.back()) : TString{}; } diff --git a/ydb/core/external_sources/s3/ut/s3_aws_credentials_ut.cpp b/ydb/core/external_sources/s3/ut/s3_aws_credentials_ut.cpp index 4eb9da765afa..08ddac807eb1 100644 --- a/ydb/core/external_sources/s3/ut/s3_aws_credentials_ut.cpp +++ b/ydb/core/external_sources/s3/ut/s3_aws_credentials_ut.cpp @@ -1,12 +1,13 @@ +#include #include #include -#include +#include #include #include #include #include -#include #include +#include #include @@ -38,7 +39,7 @@ TString Exec(const TString& cmd) { TString GetExternalPort(const TString& service, const TString& port) { auto dockerComposeBin = BinaryPath("library/recipes/docker_compose/bin/docker-compose"); - auto composeFileYml = ArcadiaSourceRoot() + "/ydb/core/external_sources/s3/ut/docker-compose.yml"; + auto composeFileYml = ArcadiaFromCurrentLocation(__SOURCE_FILE__, "docker-compose.yml"); auto result = StringSplitter(Exec(dockerComposeBin + " -f " + composeFileYml + " port " + service + " " + port)).Split(':').ToList(); return result ? 
Strip(result.back()) : TString{}; } @@ -46,7 +47,8 @@ TString GetExternalPort(const TString& service, const TString& port) { Y_UNIT_TEST_SUITE(S3AwsCredentials) { Y_UNIT_TEST(ExecuteScriptWithEqSymbol) { const TString externalDataSourceName = "/Root/external_data_source"; - auto kikimr = MakeKikimrRunner(true); + auto s3ActorsFactory = NYql::NDq::CreateS3ActorsFactory(); + auto kikimr = MakeKikimrRunner(true, nullptr, nullptr, std::nullopt, s3ActorsFactory); auto tc = kikimr->GetTableClient(); auto session = tc.CreateSession().GetValueSync().GetSession(); const TString query = fmt::format(R"( diff --git a/ydb/library/yql/providers/s3/actors/ya.make b/ydb/library/yql/providers/s3/actors/ya.make index e1a698ce90d7..66d41194158b 100644 --- a/ydb/library/yql/providers/s3/actors/ya.make +++ b/ydb/library/yql/providers/s3/actors/ya.make @@ -53,6 +53,7 @@ IF (CLANG AND NOT WITH_VALGRIND) SRCS( yql_arrow_column_converters.cpp + yql_s3_decompressor_actor.cpp yql_s3_read_actor.cpp yql_s3_source_queue.cpp ) diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp new file mode 100644 index 000000000000..738ba1f6fb7c --- /dev/null +++ b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp @@ -0,0 +1,127 @@ +#include + +#include +#include +#include +#include + +#if defined(_linux_) || defined(_darwin_) +#include +#endif + +namespace NYql::NDq { + +using namespace ::NActors; + +namespace { + +class TS3DecompressorCoroImpl : public TActorCoroImpl { +public: + TS3DecompressorCoroImpl(const TActorId& parent, const TString& compression) + : TActorCoroImpl(256_KB) + , Compression(compression) + , Parent(parent) + {} + +private: + class TCoroReadBuffer : public NDB::ReadBuffer { + public: + TCoroReadBuffer(TS3DecompressorCoroImpl* coro) + : NDB::ReadBuffer(nullptr, 0ULL) + , Coro(coro) + { } + private: + bool nextImpl() final { + while (!Coro->InputFinished || 
!Coro->Requests.empty()) { + Coro->ProcessOneEvent(); + if (Coro->InputBuffer) { + RawDataBuffer.swap(Coro->InputBuffer); + Coro->InputBuffer.clear(); + auto rawData = const_cast(RawDataBuffer.data()); + working_buffer = NDB::BufferBase::Buffer(rawData, rawData + RawDataBuffer.size()); + return true; + } + } + return false; + } + TS3DecompressorCoroImpl *const Coro; + TString RawDataBuffer; + }; + + STRICT_STFUNC(StateFunc, + hFunc(TEvS3Provider::TEvDecompressDataRequest, Handle); + hFunc(NActors::TEvents::TEvPoison, Handle); + ) + + void Handle(TEvS3Provider::TEvDecompressDataRequest::TPtr& ev) { + Requests.push(std::move(ev->Release())); + } + + void Handle(NActors::TEvents::TEvPoison::TPtr& ev) { + if (ev->Cookie) { + ythrow yexception() << "S3 decompressor actor abort"; + } + InputFinished = true; + } + + void Run() final { + try { + std::unique_ptr coroBuffer = std::make_unique(this); + NDB::ReadBuffer* buffer = coroBuffer.get(); + auto decompressorBuffer = MakeDecompressor(*buffer, Compression); + YQL_ENSURE(decompressorBuffer, "Unsupported " << Compression << " compression."); + while (!decompressorBuffer->eof()) { + decompressorBuffer->nextIfAtEnd(); + TString data{decompressorBuffer->available(), ' '}; + decompressorBuffer->read(&data.front(), decompressorBuffer->available()); + Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::move(data))); + } + } catch (const TDtorException&) { + // Stop any activity instantly + return; + } catch (...) 
{ + Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::current_exception())); + } + Send(Parent, new TEvS3Provider::TEvDecompressDataFinish()); + } + + void ProcessOneEvent() { + if (!Requests.empty()) { + ExtractDataPart(*Requests.front()); + Requests.pop(); + return; + } + TAutoPtr<::NActors::IEventHandle> ev(WaitForEvent().Release()); + StateFunc(ev); + } + + void ExtractDataPart(TEvS3Provider::TEvDecompressDataRequest& event) { + InputBuffer = std::move(event.Data); + } + +private: + TString InputBuffer; + TString Compression; + TActorId Parent; + bool InputFinished = false; + std::queue> Requests; +}; + +class TS3DecompressorCoroActor : public TActorCoro { +public: + TS3DecompressorCoroActor(THolder impl) + : TActorCoro(THolder(impl.Release())) + {} +private: + void Registered(TActorSystem* actorSystem, const TActorId& parent) override { + TActorCoro::Registered(actorSystem, parent); // Calls TActorCoro::OnRegister and sends bootstrap event to ourself. + } +}; + +} + +NActors::IActor* CreateS3DecompressorActor(const NActors::TActorId& parent, const TString& compression) { + return new TS3DecompressorCoroActor(MakeHolder(parent, compression)); +} + +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.h new file mode 100644 index 000000000000..f736e0c6c7ef --- /dev/null +++ b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace NYql::NDq { + +NActors::IActor* CreateS3DecompressorActor(const NActors::TActorId& parent, const TString& compression); + +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 81ec94b2d51a..6c37dcad4a47 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -38,6 
+38,7 @@ #include "yql_arrow_column_converters.h" #include "yql_arrow_push_down.h" +#include "yql_s3_decompressor_actor.h" #include "yql_s3_actors_util.h" #include "yql_s3_raw_read_actor.h" #include "yql_s3_read_actor.h" @@ -364,22 +365,49 @@ class TS3ReadCoroImpl : public TActorCoroImpl { TString RawDataBuffer; }; + class TCoroDecompressorBuffer : public NDB::ReadBuffer { + public: + TCoroDecompressorBuffer(TS3ReadCoroImpl* coro) + : NDB::ReadBuffer(nullptr, 0ULL) + , Coro(coro) + { } + private: + bool nextImpl() final { + while (!Coro->DecompressedInputFinished || !Coro->DeferredDecompressedDataParts.empty()) { + Coro->CpuTime += Coro->GetCpuTimeDelta(); + Coro->ProcessOneEvent(); + Coro->StartCycleCount = GetCycleCountFast(); + auto decompressed = Coro->ExtractDecompressedDataPart(); + if (decompressed) { + RawDataBuffer.swap(decompressed); + auto rawData = const_cast(RawDataBuffer.data()); + working_buffer = NDB::BufferBase::Buffer(rawData, rawData + RawDataBuffer.size()); + return true; + } else if (Coro->InputBuffer) { + Coro->Send(Coro->DecompressorActorId, new TEvS3Provider::TEvDecompressDataRequest(std::move(Coro->InputBuffer))); + } + } + return false; + } + TS3ReadCoroImpl *const Coro; + TString RawDataBuffer; + }; + void RunClickHouseParserOverHttp() { LOG_CORO_D("RunClickHouseParserOverHttp"); - std::unique_ptr coroBuffer = std::make_unique(this); + std::unique_ptr coroBuffer = AsyncDecompressing ? 
std::unique_ptr(std::make_unique(this)) : std::unique_ptr(std::make_unique(this)); std::unique_ptr decompressorBuffer; NDB::ReadBuffer* buffer = coroBuffer.get(); // lz4 decompressor reads signature in ctor, w/o actual data it will be deadlocked DownloadStart(RetryStuff, GetActorSystem(), SelfActorId, ParentActorId, PathIndex, HttpInflightSize); - if (ReadSpec->Compression) { + if (ReadSpec->Compression && !AsyncDecompressing) { decompressorBuffer = MakeDecompressor(*buffer, ReadSpec->Compression); YQL_ENSURE(decompressorBuffer, "Unsupported " << ReadSpec->Compression << " compression."); buffer = decompressorBuffer.get(); - } auto stream = std::make_unique( @@ -413,11 +441,11 @@ class TS3ReadCoroImpl : public TActorCoroImpl { TString fileName = Url.substr(7) + Path; - std::unique_ptr coroBuffer = std::make_unique(fileName); + std::unique_ptr coroBuffer = AsyncDecompressing ? std::unique_ptr(std::make_unique(this)) : std::unique_ptr(std::make_unique(fileName)); std::unique_ptr decompressorBuffer; NDB::ReadBuffer* buffer = coroBuffer.get(); - if (ReadSpec->Compression) { + if (ReadSpec->Compression && !AsyncDecompressing) { decompressorBuffer = MakeDecompressor(*buffer, ReadSpec->Compression); YQL_ENSURE(decompressorBuffer, "Unsupported " << ReadSpec->Compression << " compression."); buffer = decompressorBuffer.get(); @@ -840,6 +868,8 @@ class TS3ReadCoroImpl : public TActorCoroImpl { hFunc(TEvS3Provider::TEvDownloadStart, Handle); hFunc(TEvS3Provider::TEvDownloadData, Handle); hFunc(TEvS3Provider::TEvDownloadFinish, Handle); + hFunc(TEvS3Provider::TEvDecompressDataResult, Handle); + hFunc(TEvS3Provider::TEvDecompressDataFinish, Handle); hFunc(TEvS3Provider::TEvContinue, Handle); hFunc(TEvS3Provider::TEvReadResult2, Handle); hFunc(NActors::TEvents::TEvPoison, Handle); @@ -869,6 +899,18 @@ class TS3ReadCoroImpl : public TActorCoroImpl { Send(ComputeActorId, new IDqComputeActorAsyncInput::TEvNewAsyncInputDataArrived(InputIndex)); } + TString 
ExtractDecompressedDataPart() { + if (!DeferredDecompressedDataParts.empty()) { + auto result = std::move(DeferredDecompressedDataParts.front()); + DeferredDecompressedDataParts.pop(); + if (result->Exception) { + throw result->Exception; + } + return result->Data; + } + return {}; + } + void Handle(TEvS3Provider::TEvDownloadStart::TPtr& ev) { HttpResponseCode = ev->Get()->HttpResponseCode; CurlResponseCode = ev->Get()->CurlResponseCode; @@ -898,6 +940,14 @@ class TS3ReadCoroImpl : public TActorCoroImpl { } } + void Handle(TEvS3Provider::TEvDecompressDataResult::TPtr& ev) { + DeferredDecompressedDataParts.push(std::move(ev->Release())); + } + + void Handle(TEvS3Provider::TEvDecompressDataFinish::TPtr&) { + DecompressedInputFinished = true; + } + void Handle(TEvS3Provider::TEvDownloadFinish::TPtr& ev) { if (CurlResponseCode == CURLE_OK) { @@ -933,6 +983,7 @@ class TS3ReadCoroImpl : public TActorCoroImpl { // can't retry here: fail download RetryStuff->RetryState = nullptr; InputFinished = true; + FinishDecompressor(); LOG_CORO_W("ReadError: " << Issues.ToOneLineString() << ", LastOffset: " << LastOffset << ", LastData: " << GetLastDataAsText()); throw TS3ReadError(); // Don't pass control to data parsing, because it may validate eof and show wrong issues about incorrect data format } @@ -951,6 +1002,7 @@ class TS3ReadCoroImpl : public TActorCoroImpl { } else { LOG_CORO_D("TEvDownloadFinish, LastOffset: " << LastOffset << ", Error: " << ServerReturnedError); InputFinished = true; + FinishDecompressor(); if (ServerReturnedError) { throw TS3ReadError(); // Don't pass control to data parsing, because it may validate eof and show wrong issues about incorrect data format } @@ -973,9 +1025,16 @@ class TS3ReadCoroImpl : public TActorCoroImpl { void Handle(NActors::TEvents::TEvPoison::TPtr&) { LOG_CORO_D("TEvPoison"); RetryStuff->Cancel(); + FinishDecompressor(true); throw TS3ReadAbort(); } + void FinishDecompressor(bool force = false) { + if (AsyncDecompressing) { + 
Send(DecompressorActorId, new NActors::TEvents::TEvPoison(), 0, force); + } + } + private: static constexpr std::string_view TruncatedSuffix = "... [truncated]"sv; public: @@ -987,13 +1046,14 @@ class TS3ReadCoroImpl : public TActorCoroImpl { const ::NMonitoring::TDynamicCounters::TCounterPtr& deferredQueueSize, const ::NMonitoring::TDynamicCounters::TCounterPtr& httpInflightSize, const ::NMonitoring::TDynamicCounters::TCounterPtr& httpDataRps, - const ::NMonitoring::TDynamicCounters::TCounterPtr& rawInflightSize) + const ::NMonitoring::TDynamicCounters::TCounterPtr& rawInflightSize, + bool asyncDecompressing) : TActorCoroImpl(256_KB), ReadActorFactoryCfg(readActorFactoryCfg), InputIndex(inputIndex), TxId(txId), RetryStuff(retryStuff), ReadSpec(readSpec), ComputeActorId(computeActorId), PathIndex(pathIndex), Path(path), Url(url), RowsRemained(maxRows), SourceContext(queueBufferCounter), DeferredQueueSize(deferredQueueSize), HttpInflightSize(httpInflightSize), - HttpDataRps(httpDataRps), RawInflightSize(rawInflightSize) { + HttpDataRps(httpDataRps), RawInflightSize(rawInflightSize), AsyncDecompressing(asyncDecompressing) { } ~TS3ReadCoroImpl() override { @@ -1048,6 +1108,9 @@ class TS3ReadCoroImpl : public TActorCoroImpl { } void Run() final { + if (AsyncDecompressing) { + DecompressorActorId = Register(CreateS3DecompressorActor(SelfActorId, ReadSpec->Compression)); + } NYql::NDqProto::StatusIds::StatusCode fatalCode = NYql::NDqProto::StatusIds::EXTERNAL_ERROR; @@ -1158,12 +1221,14 @@ class TS3ReadCoroImpl : public TActorCoroImpl { const TString Url; bool InputFinished = false; + bool DecompressedInputFinished = false; long HttpResponseCode = 0L; CURLcode CurlResponseCode = CURLE_OK; bool ServerReturnedError = false; TString ErrorText; TIssues Issues; + NActors::TActorId DecompressorActorId; std::size_t LastOffset = 0; TString LastData; ui64 IngressBytes = 0; @@ -1174,11 +1239,13 @@ class TS3ReadCoroImpl : public TActorCoroImpl { std::optional RowsRemained; bool 
Paused = false; std::queue> DeferredDataParts; + std::queue> DeferredDecompressedDataParts; TSourceContext::TPtr SourceContext; const ::NMonitoring::TDynamicCounters::TCounterPtr DeferredQueueSize; const ::NMonitoring::TDynamicCounters::TCounterPtr HttpInflightSize; const ::NMonitoring::TDynamicCounters::TCounterPtr HttpDataRps; const ::NMonitoring::TDynamicCounters::TCounterPtr RawInflightSize; + const bool AsyncDecompressing; }; class TS3ReadCoroActor : public TActorCoro { @@ -1221,7 +1288,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public ui64 fileQueueBatchSizeLimit, ui64 fileQueueBatchObjectCountLimit, ui64 fileQueueConsumersCountDelta, - bool asyncDecoding + bool asyncDecoding, + bool asyncDecompressing ) : ReadActorFactoryCfg(readActorFactoryCfg) , Gateway(std::move(gateway)) , HolderFactory(holderFactory) @@ -1247,7 +1315,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public , FileQueueBatchSizeLimit(fileQueueBatchSizeLimit) , FileQueueBatchObjectCountLimit(fileQueueBatchObjectCountLimit) , FileQueueConsumersCountDelta(fileQueueConsumersCountDelta) - , AsyncDecoding(asyncDecoding) { + , AsyncDecoding(asyncDecoding) + , AsyncDecompressing(asyncDecompressing) { if (Counters) { QueueDataSize = Counters->GetCounter("QueueDataSize"); QueueDataLimit = Counters->GetCounter("QueueDataLimit"); @@ -1409,7 +1478,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public DeferredQueueSize, HttpInflightSize, HttpDataRps, - RawInflightSize + RawInflightSize, + AsyncDecompressing ); if (AsyncDecoding) { actorId = Register(new TS3ReadCoroActor(std::move(impl))); @@ -1830,6 +1900,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public ui64 FileQueueBatchObjectCountLimit; ui64 FileQueueConsumersCountDelta; const bool AsyncDecoding; + const bool AsyncDecompressing; bool IsCurrentBatchEmpty = false; bool IsFileQueueEmpty = false; bool IsWaitingFileQueueResponse = false; @@ -2183,7 +2254,7 @@ std::pair CreateS3ReadActor( 
std::move(paths), addPathIndex, readSpec, computeActorId, retryPolicy, cfg, counters, taskCounters, fileSizeLimit, sizeLimit, rowsLimitHint, memoryQuotaManager, params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta, - params.GetAsyncDecoding()); + params.GetAsyncDecoding(), params.GetAsyncDecompressing()); return {actor, actor}; } else { diff --git a/ydb/library/yql/providers/s3/compressors/lz4io.cpp b/ydb/library/yql/providers/s3/compressors/lz4io.cpp index cae7cc1ab6a0..1beabc190f19 100644 --- a/ydb/library/yql/providers/s3/compressors/lz4io.cpp +++ b/ydb/library/yql/providers/s3/compressors/lz4io.cpp @@ -22,7 +22,7 @@ constexpr ui32 LegacyMagicNumber = 0x184C2102U; constexpr size_t LegacyBlockSize = 8_MB; constexpr size_t FrameMaxBlockSize = 4_MB; -void WriteLE32 (void* p, ui32 value32) +void WriteLE32(void* p, ui32 value32) { const auto dstPtr = static_cast(p); dstPtr[0] = (unsigned char)value32; @@ -31,7 +31,7 @@ void WriteLE32 (void* p, ui32 value32) dstPtr[3] = (unsigned char)(value32 >> 24U); } -ui32 ReadLE32 (const void* s) { +ui32 ReadLE32(const void* s) { const auto srcPtr = static_cast(s); ui32 value32 = srcPtr[0]; value32 += (ui32)srcPtr[1] << 8U; @@ -112,29 +112,35 @@ bool TReadBuffer::nextImpl() { } size_t TReadBuffer::DecompressFrame() { - if (NextToLoad > InBuffer.size()) { - InBuffer.resize(NextToLoad); - } - - if (Pos >= Remaining) { - for (auto toRead = NextToLoad; toRead > 0U;) { - const auto sizeCheck = Source.read(InBuffer.data() + NextToLoad - toRead, toRead); - YQL_ENSURE(sizeCheck > 0U && sizeCheck <= toRead, "Cannot access compressed block."); - toRead -= sizeCheck; + while (NextToLoad) { + if (NextToLoad > InBuffer.size()) { + InBuffer.resize(NextToLoad); } - Pos = 0ULL; - Remaining = NextToLoad; - } + if (Pos >= Remaining) { + for (auto toRead = NextToLoad; toRead > 0U;) { + const auto sizeCheck = Source.read(InBuffer.data() + NextToLoad - toRead, toRead); + 
YQL_ENSURE(sizeCheck > 0U && sizeCheck <= toRead, "Cannot access compressed block."); + toRead -= sizeCheck; + } + + Pos = 0ULL; + Remaining = NextToLoad; + } - if (Pos < Remaining) { auto decodedBytes = OutBuffer.size(); - NextToLoad = LZ4F_decompress_usingDict(Ctx, OutBuffer.data(), &decodedBytes, InBuffer.data() + Pos, &Remaining, nullptr, 0ULL, nullptr); - YQL_ENSURE(!LZ4F_isError(NextToLoad), "Decompression error: " << LZ4F_getErrorName(NextToLoad)); - Pos += Remaining; + while (Pos < Remaining || (decodedBytes == OutBuffer.size())) { + decodedBytes = OutBuffer.size(); + NextToLoad = LZ4F_decompress_usingDict(Ctx, OutBuffer.data(), &decodedBytes, InBuffer.data() + Pos, &Remaining, nullptr, 0ULL, nullptr); + YQL_ENSURE(!LZ4F_isError(NextToLoad), "Decompression error: " << LZ4F_getErrorName(NextToLoad)); + Pos += Remaining; + + if (decodedBytes) + return decodedBytes; - if (decodedBytes) - return decodedBytes; + if (!NextToLoad) + return decodedBytes; + } } return 0ULL; diff --git a/ydb/library/yql/providers/s3/compressors/ut/decompressor_ut.cpp b/ydb/library/yql/providers/s3/compressors/ut/decompressor_ut.cpp new file mode 100644 index 000000000000..020401ae7f56 --- /dev/null +++ b/ydb/library/yql/providers/s3/compressors/ut/decompressor_ut.cpp @@ -0,0 +1,45 @@ +#include +#include + +#include +#include +#include + +namespace NYql::NCompressors { + +namespace { + TString GetResourcePath(const TString& path) { + return ArcadiaFromCurrentLocation(__SOURCE_FILE__, "test_compression_data/" + path); + } +} + +Y_UNIT_TEST_SUITE(TCompressorTests) { + Y_UNIT_TEST(SuccessLz4) { + NDB::ReadBufferFromFile buffer(GetResourcePath("test.json.lz4")); + auto decompressorBuffer = std::make_unique(buffer); + + char str[256] = {}; + decompressorBuffer->read(str, 256); + UNIT_ASSERT_VALUES_EQUAL(NSc::TValue::FromJsonThrow(str), NSc::TValue::FromJsonThrow(R"([ + { + "id": 0, + "description": "yq", + "info": "abc" + } + ])")); + } + + Y_UNIT_TEST(WrongMagicLz4) { + 
NDB::ReadBufferFromFile buffer(GetResourcePath("test.json")); + UNIT_ASSERT_EXCEPTION_CONTAINS(std::make_unique(buffer), yexception, "TReadBuffer(): requirement StreamType != EStreamType::Unknown failed, message: Wrong magic."); + } + + Y_UNIT_TEST(ErrorLz4) { + NDB::ReadBufferFromFile buffer(GetResourcePath("test.broken.lz4")); + auto decompressorBuffer = std::make_unique(buffer); + char str[256] = {}; + UNIT_ASSERT_EXCEPTION_CONTAINS(decompressorBuffer->read(str, 256), yexception, "DecompressFrame(): requirement !LZ4F_isError(NextToLoad) failed, message: Decompression error: ERROR_reservedFlag_set"); + } +} + +} diff --git a/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.broken.lz4 b/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.broken.lz4 new file mode 100644 index 0000000000000000000000000000000000000000..6ee4c1f3455b894f3d06b5ec01966083c6ec6243 GIT binary patch literal 79 zcmZQk@|8&Wzte_+f#DNlG*>m30uU%=rYKn{80hdad}dBbElw`VEGWs$&jX4nRTe5q fGB{-BrR57TEJ#dBR^qDVisj-0YG;Tl*wX_5S>F|G literal 0 HcmV?d00001 diff --git a/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json b/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json new file mode 100644 index 000000000000..7a403c671755 --- /dev/null +++ b/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json @@ -0,0 +1,8 @@ +[ +{ + "id": 0, + "description": "yq", + "info": "abc" +} +] + diff --git a/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json.lz4 b/ydb/library/yql/providers/s3/compressors/ut/test_compression_data/test.json.lz4 new file mode 100644 index 0000000000000000000000000000000000000000..f78d6a71ee174b0c6e35b68dd7b9fb2069d0637f GIT binary patch literal 79 zcmZQk@|8#_*lEMS!0?GNnyZ>i0SJ^bQ=K;l(Dhrh) f85}b6((;8E79=JmD{<9w#d2{0wKGH&?CAjjFo_i1 literal 0 HcmV?d00001 diff --git a/ydb/library/yql/providers/s3/compressors/ut/ya.make 
b/ydb/library/yql/providers/s3/compressors/ut/ya.make new file mode 100644 index 000000000000..40af15d146db --- /dev/null +++ b/ydb/library/yql/providers/s3/compressors/ut/ya.make @@ -0,0 +1,24 @@ +IF (NOT OS_WINDOWS AND CLANG AND NOT WITH_VALGRIND) + +UNITTEST_FOR(ydb/library/yql/providers/s3/compressors) + +SRCS( + decompressor_ut.cpp +) + +PEERDIR( + library/cpp/scheme + ydb/library/yql/public/udf/service/stub + ydb/library/yql/udfs/common/clickhouse/client +) + +ADDINCL( + ydb/library/yql/udfs/common/clickhouse/client/base + ydb/library/yql/udfs/common/clickhouse/client/base/pcg-random + ydb/library/yql/udfs/common/clickhouse/client/src +) + +END() + +ENDIF() + diff --git a/ydb/library/yql/providers/s3/compressors/ya.make b/ydb/library/yql/providers/s3/compressors/ya.make index bcf616858d12..9dc6e9b96f65 100644 --- a/ydb/library/yql/providers/s3/compressors/ya.make +++ b/ydb/library/yql/providers/s3/compressors/ya.make @@ -36,3 +36,6 @@ YQL_LAST_ABI_VERSION() END() +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/library/yql/providers/s3/events/events.h b/ydb/library/yql/providers/s3/events/events.h index 5472daa80723..b6baf5133650 100644 --- a/ydb/library/yql/providers/s3/events/events.h +++ b/ydb/library/yql/providers/s3/events/events.h @@ -2,6 +2,7 @@ #include +#include #include #include @@ -48,6 +49,10 @@ struct TEvS3Provider { EvCachePutRequest, EvCacheNotification, EvCacheSourceFinish, + // Decompressor events + EvDecompressDataRequest, + EvDecompressDataResult, + EvDecompressDataFinish, EvEnd }; static_assert(EvEnd < EventSpaceEnd(NKikimr::TKikimrEvents::ES_S3_PROVIDER), "expect EvEnd < EventSpaceEnd(TEvents::ES_S3_PROVIDER)"); @@ -196,6 +201,21 @@ struct TEvS3Provider { struct TEvContinue : public NActors::TEventLocal { }; + struct TEvDecompressDataRequest : public NActors::TEventLocal { + TEvDecompressDataRequest(TString&& data) : Data(std::move(data)) {} + TString Data; + }; + + struct TEvDecompressDataResult : public NActors::TEventLocal { + 
TEvDecompressDataResult(TString&& data) : Data(std::move(data)) {} + TEvDecompressDataResult(std::exception_ptr exception) : Exception(exception) {} + TString Data; + std::exception_ptr Exception; + }; + + struct TEvDecompressDataFinish : public NActors::TEventLocal { + }; + struct TReadRange { int64_t Offset; int64_t Length; diff --git a/ydb/library/yql/providers/s3/proto/source.proto b/ydb/library/yql/providers/s3/proto/source.proto index eaae53df3476..225a0720d762 100644 --- a/ydb/library/yql/providers/s3/proto/source.proto +++ b/ydb/library/yql/providers/s3/proto/source.proto @@ -26,4 +26,5 @@ message TSource { bool UseRuntimeListing = 13; bool AsyncDecoding = 14; NYql.NConnector.NApi.TPredicate Predicate = 15; + bool AsyncDecompressing = 16; } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index b90ada844c3b..47bcdf156cc5 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -437,6 +437,7 @@ class TS3DqIntegration: public TDqIntegrationBase { } srcDesc.SetAsyncDecoding(State_->Configuration->AsyncDecoding.Get().GetOrElse(false)); + srcDesc.SetAsyncDecompressing(State_->Configuration->AsyncDecompressing.Get().GetOrElse(false)); #if defined(_linux_) || defined(_darwin_) diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp index 1bc0306fec30..cace7a62e1fc 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp @@ -28,6 +28,7 @@ TS3Configuration::TS3Configuration() REGISTER_SETTING(*this, FileQueuePrefetchSize); REGISTER_SETTING(*this, AsyncDecoding); REGISTER_SETTING(*this, UsePredicatePushdown); + REGISTER_SETTING(*this, AsyncDecompressing); } TS3Settings::TConstPtr TS3Configuration::Snapshot() const { diff --git 
a/ydb/library/yql/providers/s3/provider/yql_s3_settings.h b/ydb/library/yql/providers/s3/provider/yql_s3_settings.h index 172275bec7d4..6280d7ace419 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_settings.h +++ b/ydb/library/yql/providers/s3/provider/yql_s3_settings.h @@ -30,6 +30,7 @@ struct TS3Settings { NCommon::TConfSetting FileQueuePrefetchSize; NCommon::TConfSetting AsyncDecoding; // Parse and decode input data at separate mailbox/thread of TaskRunner NCommon::TConfSetting UsePredicatePushdown; + NCommon::TConfSetting AsyncDecompressing; // Decompression and parsing input data in different mailbox/thread }; struct TS3ClusterSettings { From 1f307a1a9e86d94ab8dd17bcfe4083d8de7adadf Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Wed, 24 Jul 2024 14:13:32 +0300 Subject: [PATCH 05/56] =?UTF-8?q?YDB=20FQ:=20avoid=20outdated=20syntax=20"?= =?UTF-8?q?SELECT=20*=20FROM=20cluster.db.table"=20(copy=20=E2=80=A6=20(#6?= =?UTF-8?q?971)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Grigorii Papashvili Co-authored-by: Vitaly Isaev --- .../generic/connector/libcpp/error.cpp | 2 ++ .../datasource/clickhouse/docker-compose.yml | 2 +- .../datasource/postgresql/docker-compose.yml | 2 +- .../tests/datasource/ydb/docker-compose.yml | 2 +- .../connector/tests/join/docker-compose.yml | 2 +- .../provider/yql_generic_cluster_config.cpp | 36 ++++++++++++++----- .../provider/yql_generic_dq_integration.cpp | 9 +---- .../provider/yql_generic_load_meta.cpp | 34 ++---------------- ydb/tests/fq/generic/docker-compose.yml | 2 +- 9 files changed, 37 insertions(+), 54 deletions(-) diff --git a/ydb/library/yql/providers/generic/connector/libcpp/error.cpp b/ydb/library/yql/providers/generic/connector/libcpp/error.cpp index 3f6bd3b7a448..a050e1f8ef27 100644 --- a/ydb/library/yql/providers/generic/connector/libcpp/error.cpp +++ b/ydb/library/yql/providers/generic/connector/libcpp/error.cpp @@ -37,6 +37,8 @@ namespace 
NYql::NConnector { return NDqProto::StatusIds::StatusCode::StatusIds_StatusCode_UNSUPPORTED; case ::Ydb::StatusIds::StatusCode::StatusIds_StatusCode_NOT_FOUND: return NDqProto::StatusIds::StatusCode::StatusIds_StatusCode_BAD_REQUEST; + case ::Ydb::StatusIds::StatusCode::StatusIds_StatusCode_SCHEME_ERROR: + return NDqProto::StatusIds::StatusCode::StatusIds_StatusCode_SCHEME_ERROR; default: ythrow yexception() << "Unexpected YDB status code: " << ::Ydb::StatusIds::StatusCode_Name(error.status()); } diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml index afa9f3d4d3c2..dd58ce80b4df 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-ch-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.4.9@sha256:3a1fe086be50c0edbae2c2b284aee5ce76bd056d7f46cb460919ec37a6f8ab5c + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml index 146f50dd07b0..de6e29ea8bf4 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml @@ -1,7 +1,7 @@ services: fq-connector-go: container_name: fq-tests-pg-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.4.9@sha256:3a1fe086be50c0edbae2c2b284aee5ce76bd056d7f46cb460919ec37a6f8ab5c + image: 
ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml index 4b691f155d54..37a2dec71d1d 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml @@ -5,7 +5,7 @@ services: echo \"$$(dig fq-tests-ydb-ydb +short) fq-tests-ydb-ydb\" >> /etc/hosts; cat /etc/hosts; /opt/ydb/bin/fq-connector-go server -c /opt/ydb/cfg/fq-connector-go.yaml" container_name: fq-tests-ydb-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.4.9@sha256:3a1fe086be50c0edbae2c2b284aee5ce76bd056d7f46cb460919ec37a6f8ab5c + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml index efb69b50eee6..8383a480bf6f 100644 --- a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-join-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.4.9@sha256:3a1fe086be50c0edbae2c2b284aee5ce76bd056d7f46cb460919ec37a6f8ab5c + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_cluster_config.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_cluster_config.cpp index 0c0e769184c0..4f773517111c 100644 --- 
a/ydb/library/yql/providers/generic/provider/yql_generic_cluster_config.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_cluster_config.cpp @@ -107,14 +107,12 @@ namespace NYql { NYql::TGenericClusterConfig& clusterConfig) { auto it = properties.find("database_name"); if (it == properties.cend()) { - // TODO: make this property required during https://st.yandex-team.ru/YQ-2494 - // ythrow yexception() << "missing 'DATABASE_NAME' value"; + // DATABASE_NAME is a mandatory field for the most of databases, + // however, managed YDB does not require it, so we have to accept empty values here. return; } if (!it->second) { - // TODO: make this property required during https://st.yandex-team.ru/YQ-2494 - // ythrow yexception() << "invalid 'DATABASE_NAME' value: '" << it->second << "'"; return; } @@ -125,14 +123,12 @@ namespace NYql { NYql::TGenericClusterConfig& clusterConfig) { auto it = properties.find("schema"); if (it == properties.cend()) { - // TODO: make this property required during https://st.yandex-team.ru/YQ-2494 - // ythrow yexception() << "missing 'SCHEMA' value"; + // SCHEMA is optional field return; } if (!it->second) { - // TODO: make this property required during https://st.yandex-team.ru/YQ-2494 - // ythrow yexception() << "invalid 'SCHEMA' value: '" << it->second << "'"; + // SCHEMA is optional field return; } @@ -318,9 +314,20 @@ namespace NYql { } static const TSet managedDatabaseKinds{ + NConnector::NApi::EDataSourceKind::CLICKHOUSE, + NConnector::NApi::EDataSourceKind::GREENPLUM, + NConnector::NApi::EDataSourceKind::MYSQL, NConnector::NApi::EDataSourceKind::POSTGRESQL, + NConnector::NApi::EDataSourceKind::YDB, + }; + + static const TSet traditionalRelationalDatabaseKinds{ NConnector::NApi::EDataSourceKind::CLICKHOUSE, - NConnector::NApi::EDataSourceKind::YDB}; + NConnector::NApi::EDataSourceKind::GREENPLUM, + NConnector::NApi::EDataSourceKind::MS_SQL_SERVER, + NConnector::NApi::EDataSourceKind::MYSQL, + 
NConnector::NApi::EDataSourceKind::POSTGRESQL, + }; void ValidateGenericClusterConfig( const NYql::TGenericClusterConfig& clusterConfig, @@ -396,6 +403,17 @@ namespace NYql { } } + // All the databases with exception to managed YDB: + // * DATABASE_NAME is mandatory field + if (traditionalRelationalDatabaseKinds.contains(clusterConfig.GetKind())) { + if (!clusterConfig.GetDatabaseName()) { + return ValidationError( + clusterConfig, + context, + "You must provide database name explicitly"); + } + } + // check required fields if (!clusterConfig.GetName()) { return ValidationError(clusterConfig, context, "empty field 'Name'"); diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp index 19d8a5694d21..24fab4c67637 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp @@ -264,13 +264,6 @@ namespace NYql { const auto& clusterConfig = State_->Configuration->ClusterNamesToClusterConfigs[clusterName]; const auto& endpoint = clusterConfig.endpoint(); - // for backward compability full path can be used (cluster_name.`db_name.table`) - // TODO: simplify during https://st.yandex-team.ru/YQ-2494 - TStringBuf db, dbTable; - if (!TStringBuf(table).TrySplit('.', db, dbTable)) { - dbTable = table; - } - YQL_CLOG(INFO, ProviderGeneric) << "Filling lookup source settings" << ": cluster: " << clusterName @@ -283,7 +276,7 @@ namespace NYql { } Generic::TLookupSource source; - source.set_table(TString(dbTable)); + source.set_table(table); *source.mutable_data_source_instance() = tableMeta.value()->DataSourceInstance; // Managed YDB supports access via IAM token. 
diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_load_meta.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_load_meta.cpp index a5becd6add88..994d1cc6e81d 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_load_meta.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_load_meta.cpp @@ -367,38 +367,8 @@ namespace NYql { void FillTablePath(NConnector::NApi::TDescribeTableRequest& request, const TGenericClusterConfig& clusterConfig, const TString& tablePath) { - // for backward compability full path can be used (cluster_name.`db_name.table`) - // TODO: simplify during https://st.yandex-team.ru/YQ-2494 - const auto dataSourceKind = clusterConfig.GetKind(); - const auto& dbNameFromConfig = clusterConfig.GetDatabaseName(); - TStringBuf dbNameTarget, tableName; - auto isFullPath = TStringBuf(tablePath).TrySplit('.', dbNameTarget, tableName); - - if (!dbNameFromConfig.empty()) { - dbNameTarget = dbNameFromConfig; - if (!isFullPath) { - tableName = tablePath; - } - } else if (!isFullPath) { - tableName = tablePath; - switch (dataSourceKind) { - case NYql::NConnector::NApi::CLICKHOUSE: - dbNameTarget = "default"; - break; - case NYql::NConnector::NApi::POSTGRESQL: - dbNameTarget = "postgres"; - break; - case NYql::NConnector::NApi::MS_SQL_SERVER: - dbNameTarget = "mssqlserver"; - break; - default: - ythrow yexception() << "You must provide database name explicitly for data source kind: '" - << NYql::NConnector::NApi::EDataSourceKind_Name(dataSourceKind) << "'"; - } - } // else take database name from table path - - request.mutable_data_source_instance()->set_database(TString(dbNameTarget)); - request.set_table(TString(tableName)); + request.mutable_data_source_instance()->set_database(clusterConfig.GetDatabaseName()); + request.set_table(tablePath); } private: diff --git a/ydb/tests/fq/generic/docker-compose.yml b/ydb/tests/fq/generic/docker-compose.yml index 8b8d86f4ddb3..ce60fdfe7d63 100644 --- 
a/ydb/tests/fq/generic/docker-compose.yml +++ b/ydb/tests/fq/generic/docker-compose.yml @@ -15,7 +15,7 @@ services: echo \"$$(dig tests-fq-generic-ydb +short) tests-fq-generic-ydb\" >> /etc/hosts; cat /etc/hosts; /opt/ydb/bin/fq-connector-go server -c /opt/ydb/cfg/fq-connector-go.yaml" container_name: tests-fq-generic-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.4.9@sha256:3a1fe086be50c0edbae2c2b284aee5ce76bd056d7f46cb460919ec37a6f8ab5c + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 ports: - "2130" postgresql: From affdcf4727f436c6bc0e008aff8f6b6af60a8f48 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Mon, 29 Jul 2024 19:24:54 +0300 Subject: [PATCH 06/56] merge fq stable 2024 07 29 (#7193) --- ydb/core/fq/libs/actors/run_actor.cpp | 6 ++- ydb/core/fq/libs/init/init.cpp | 21 +++------ .../ydb_over_fq/execute_data_query.cpp | 5 +++ .../kqp/compute_actor/kqp_compute_actor.cpp | 2 +- .../kqp_federated_query_helpers.cpp | 5 ++- .../kqp_federated_query_helpers.h | 6 ++- ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp | 2 +- .../common/proto/gateways_config.proto | 4 ++ .../pq/async_io/dq_pq_read_actor.cpp | 45 +++++++++++++++++-- .../providers/pq/async_io/dq_pq_read_actor.h | 3 +- .../s3/actors/yql_s3_decompressor_actor.cpp | 22 +++++++-- .../providers/s3/actors/yql_s3_read_actor.cpp | 5 ++- .../actors_factory/yql_s3_actors_factory.cpp | 26 +++++++++++ .../s3/actors_factory/yql_s3_actors_factory.h | 1 + ydb/library/yql/providers/s3/events/events.h | 18 +++++++- .../s3/provider/yql_s3_dq_integration.cpp | 2 +- .../providers/s3/provider/yql_s3_settings.cpp | 1 + .../providers/s3/provider/yql_s3_settings.h | 2 + ydb/tests/fq/pq_async_io/ut_helpers.cpp | 1 + ydb/tests/fq/s3/conftest.py | 13 ++++-- ydb/tests/fq/s3/test_format_setting.py | 8 ++-- ydb/tests/fq/s3/test_ydb_over_fq.py | 28 ++++++++++++ ydb/tools/query_replay/query_compiler.cpp | 2 +- 
ydb/tools/query_replay_yt/query_compiler.cpp | 2 +- 24 files changed, 188 insertions(+), 42 deletions(-) diff --git a/ydb/core/fq/libs/actors/run_actor.cpp b/ydb/core/fq/libs/actors/run_actor.cpp index 29626a4ec054..f52beadd6735 100644 --- a/ydb/core/fq/libs/actors/run_actor.cpp +++ b/ydb/core/fq/libs/actors/run_actor.cpp @@ -779,7 +779,11 @@ class TRunActor : public NActors::TActorBootstrapped { mkqlDefaultLimit = 8_GB; } + // This part is for backward compatibility. TODO: remove this part after migration to TS3GatewayConfig auto s3ReadDefaultInflightLimit = Params.Config.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig().GetDataInflight(); + if (s3ReadDefaultInflightLimit == 0) { + s3ReadDefaultInflightLimit = Params.Config.GetGateways().GetS3().GetDataInflight(); + } if (s3ReadDefaultInflightLimit == 0) { s3ReadDefaultInflightLimit = 200_MB; } @@ -1936,7 +1940,7 @@ class TRunActor : public NActors::TActorBootstrapped { { dataProvidersInit.push_back(GetS3DataProviderInitializer(Params.S3Gateway, Params.CredentialsFactory, - Params.Config.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig().GetAllowLocalFiles())); + Params.Config.GetReadActorsFactoryConfig().HasS3ReadActorFactoryConfig() ? Params.Config.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig().GetAllowLocalFiles() : Params.Config.GetGateways().GetS3().GetAllowLocalFiles())); // This part is for backward compatibility. 
TODO: remove this part after migration to TS3GatewayConfig } { diff --git a/ydb/core/fq/libs/init/init.cpp b/ydb/core/fq/libs/init/init.cpp index acd5518a98cf..46d9042f875b 100644 --- a/ydb/core/fq/libs/init/init.cpp +++ b/ydb/core/fq/libs/init/init.cpp @@ -198,7 +198,9 @@ void Init( if (protoConfig.GetPrivateApi().GetEnabled()) { const auto& s3readConfig = protoConfig.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig(); auto s3HttpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.MaxTime = TDuration::Max(), .RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); - NYql::NDq::TS3ReadActorFactoryConfig readActorFactoryCfg; + NYql::NDq::TS3ReadActorFactoryConfig readActorFactoryCfg = NYql::NDq::CreateReadActorFactoryConfig(protoConfig.GetGateways().GetS3()); + + // These fillings were left for the backward compatibility. TODO: remove this part after migration to TS3GatewayConfig if (const ui64 rowsInBatch = s3readConfig.GetRowsInBatch()) { readActorFactoryCfg.RowsInBatch = rowsInBatch; } @@ -208,22 +210,9 @@ void Init( if (const ui64 dataInflight = s3readConfig.GetDataInflight()) { readActorFactoryCfg.DataInflight = dataInflight; } - for (auto& formatSizeLimit: protoConfig.GetGateways().GetS3().GetFormatSizeLimit()) { - if (formatSizeLimit.GetName()) { // ignore unnamed limits - readActorFactoryCfg.FormatSizeLimits.emplace( - formatSizeLimit.GetName(), formatSizeLimit.GetFileSizeLimit()); - } - } - if (protoConfig.GetGateways().GetS3().HasFileSizeLimit()) { - readActorFactoryCfg.FileSizeLimit = - protoConfig.GetGateways().GetS3().GetFileSizeLimit(); - } - if (protoConfig.GetGateways().GetS3().HasBlockFileSizeLimit()) { - readActorFactoryCfg.BlockFileSizeLimit = - protoConfig.GetGateways().GetS3().GetBlockFileSizeLimit(); - } + RegisterDqInputTransformLookupActorFactory(*asyncIoFactory); - RegisterDqPqReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory); + RegisterDqPqReadActorFactory(*asyncIoFactory, 
yqSharedResources->UserSpaceYdbDriver, credentialsFactory, yqCounters->GetSubgroup("subsystem", "DqSourceTracker")); RegisterYdbReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory); s3ActorsFactory->RegisterS3ReadActorFactory(*asyncIoFactory, credentialsFactory, httpGateway, s3HttpRetryPolicy, readActorFactoryCfg, diff --git a/ydb/core/grpc_services/ydb_over_fq/execute_data_query.cpp b/ydb/core/grpc_services/ydb_over_fq/execute_data_query.cpp index 2f6be1f7c7ab..02ae2be92030 100644 --- a/ydb/core/grpc_services/ydb_over_fq/execute_data_query.cpp +++ b/ydb/core/grpc_services/ydb_over_fq/execute_data_query.cpp @@ -127,6 +127,11 @@ class ExecuteDataQueryRPC ) void HandleResultSets(const TString& queryId, const TActorContext& ctx) { + if (ResultSetSizes_.empty()) { + SendReply(ctx); + return; + } + Become(&ExecuteDataQueryRPC::GatherResultSetsState); QueryId_ = queryId; MakeLocalCall(CreateResultSetRequest(queryId, 0, 0), ctx); diff --git a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp index 74b0f6f14f79..f4528f079ed1 100644 --- a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp +++ b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp @@ -78,7 +78,7 @@ NYql::NDq::IDqAsyncIoFactory::TPtr CreateKqpAsyncIoFactory( if (federatedQuerySetup) { auto s3HttpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); - s3ActorsFactory->RegisterS3ReadActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy); + s3ActorsFactory->RegisterS3ReadActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy, federatedQuerySetup->S3ReadActorFactoryConfig); s3ActorsFactory->RegisterS3WriteActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy); if 
(federatedQuerySetup->ConnectorClient) { diff --git a/ydb/core/kqp/federated_query/kqp_federated_query_helpers.cpp b/ydb/core/kqp/federated_query/kqp_federated_query_helpers.cpp index 18986fb7cf41..f767e5f131ed 100644 --- a/ydb/core/kqp/federated_query/kqp_federated_query_helpers.cpp +++ b/ydb/core/kqp/federated_query/kqp_federated_query_helpers.cpp @@ -72,6 +72,8 @@ namespace NKikimr::NKqp { S3GatewayConfig = queryServiceConfig.GetS3(); + S3ReadActorFactoryConfig = NYql::NDq::CreateReadActorFactoryConfig(S3GatewayConfig); + YtGatewayConfig = queryServiceConfig.GetYt(); YtGateway = MakeYtGateway(appData->FunctionRegistry, queryServiceConfig); @@ -127,7 +129,8 @@ namespace NKikimr::NKqp { GenericGatewaysConfig, YtGatewayConfig, YtGateway, - nullptr}; + nullptr, + S3ReadActorFactoryConfig}; // Init DatabaseAsyncResolver only if all requirements are met if (DatabaseResolverActorId && MdbEndpointGenerator && diff --git a/ydb/core/kqp/federated_query/kqp_federated_query_helpers.h b/ydb/core/kqp/federated_query/kqp_federated_query_helpers.h index 678de407f43f..b809fb4be581 100644 --- a/ydb/core/kqp/federated_query/kqp_federated_query_helpers.h +++ b/ydb/core/kqp/federated_query/kqp_federated_query_helpers.h @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace NKikimrConfig { @@ -30,6 +31,7 @@ namespace NKikimr::NKqp { NYql::TYtGatewayConfig YtGatewayConfig; NYql::IYtGateway::TPtr YtGateway; NMiniKQL::TComputationNodeFactory ComputationFactory; + NYql::NDq::TS3ReadActorFactoryConfig S3ReadActorFactoryConfig; }; struct IKqpFederatedQuerySetupFactory { @@ -65,6 +67,7 @@ namespace NKikimr::NKqp { NYql::NConnector::IClient::TPtr ConnectorClient; std::optional DatabaseResolverActorId; NYql::IMdbEndpointGenerator::TPtr MdbEndpointGenerator; + NYql::NDq::TS3ReadActorFactoryConfig S3ReadActorFactoryConfig; }; struct TKqpFederatedQuerySetupFactoryMock: public IKqpFederatedQuerySetupFactory { @@ -94,7 +97,7 @@ namespace NKikimr::NKqp { std::optional 
Make(NActors::TActorSystem*) override { return TKqpFederatedQuerySetup{ - HttpGateway, ConnectorClient, CredentialsFactory, DatabaseAsyncResolver, S3GatewayConfig, GenericGatewayConfig, YtGatewayConfig, YtGateway, ComputationFactories}; + HttpGateway, ConnectorClient, CredentialsFactory, DatabaseAsyncResolver, S3GatewayConfig, GenericGatewayConfig, YtGatewayConfig, YtGateway, ComputationFactories, S3ReadActorFactoryConfig}; } private: @@ -107,6 +110,7 @@ namespace NKikimr::NKqp { NYql::TYtGatewayConfig YtGatewayConfig; NYql::IYtGateway::TPtr YtGateway; NMiniKQL::TComputationNodeFactory ComputationFactories; + NYql::NDq::TS3ReadActorFactoryConfig S3ReadActorFactoryConfig; }; IKqpFederatedQuerySetupFactory::TPtr MakeKqpFederatedQuerySetupFactory( diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp index 0903b929a4b5..4a349c0e5227 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp @@ -53,7 +53,7 @@ TIntrusivePtr CreateKikimrQueryProcessor(TIntrusivePtr ga UNIT_ASSERT(TryParseFromTextFormat(defaultSettingsStream, defaultSettings)); kikimrConfig->Init(defaultSettings.GetDefaultSettings(), cluster, settings, true); - auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr}); + auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); return NKqp::CreateKqpHost(gateway, cluster, "/Root", kikimrConfig, moduleResolver, federatedQuerySetup, nullptr, nullptr, {}, funcRegistry, funcRegistry, keepConfigChanges, nullptr, actorSystem); } diff --git a/ydb/library/yql/providers/common/proto/gateways_config.proto b/ydb/library/yql/providers/common/proto/gateways_config.proto index 01eb25f2dcfa..61d9082f528f 100644 --- a/ydb/library/yql/providers/common/proto/gateways_config.proto +++ 
b/ydb/library/yql/providers/common/proto/gateways_config.proto @@ -398,6 +398,10 @@ message TS3GatewayConfig { optional uint64 RegexpCacheSize = 14; optional uint64 GeneratorPathsLimit = 15; optional uint64 MaxListingResultSizePerPartition = 16; + optional uint64 RowsInBatch = 17; // Default = 1000 + optional uint64 MaxInflight = 18; // Default = 20 + optional uint64 DataInflight = 19; // Default = 200 MB + optional bool AllowLocalFiles = 20; repeated TAttr DefaultSettings = 100; } diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp index 54f8913bfb9d..112187cd72ba 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp @@ -85,6 +85,30 @@ struct TEvPrivate { } // namespace class TDqPqReadActor : public NActors::TActor, public IDqComputeActorAsyncInput { + struct TMetrics { + TMetrics(const TTxId& txId, ui64 taskId, const ::NMonitoring::TDynamicCounterPtr& counters) + : TxId(std::visit([](auto arg) { return ToString(arg); }, txId)) + , Counters(counters) { + SubGroup = Counters->GetSubgroup("sink", "PqRead"); + auto sink = SubGroup->GetSubgroup("tx_id", TxId); + auto task = sink->GetSubgroup("task_id", ToString(taskId)); + InFlyAsyncInputData = task->GetCounter("InFlyAsyncInputData"); + InFlySubscribe = task->GetCounter("InFlySubscribe"); + AsyncInputDataRate = task->GetCounter("AsyncInputDataRate", true); + } + + ~TMetrics() { + SubGroup->RemoveSubgroup("id", TxId); + } + + TString TxId; + ::NMonitoring::TDynamicCounterPtr Counters; + ::NMonitoring::TDynamicCounterPtr SubGroup; + ::NMonitoring::TDynamicCounters::TCounterPtr InFlyAsyncInputData; + ::NMonitoring::TDynamicCounters::TCounterPtr InFlySubscribe; + ::NMonitoring::TDynamicCounters::TCounterPtr AsyncInputDataRate; + }; + public: using TPartitionKey = std::pair; // Cluster, partition id. 
using TDebugOffsets = TMaybe>; @@ -100,10 +124,12 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, const NActors::TActorId& computeActorId, + const ::NMonitoring::TDynamicCounterPtr& counters, i64 bufferSize) : TActor(&TDqPqReadActor::StateFunc) , InputIndex(inputIndex) , TxId(txId) + , Metrics(txId, taskId, counters) , BufferSize(bufferSize) , HolderFactory(holderFactory) , LogPrefix(TStringBuilder() << "SelfId: " << this->SelfId() << ", TxId: " << TxId << ", task: " << taskId << ". PQ source. ") @@ -245,9 +271,14 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute hFunc(TEvPrivate::TEvSourceDataReady, Handle); ) - void Handle(TEvPrivate::TEvSourceDataReady::TPtr&) { + void Handle(TEvPrivate::TEvSourceDataReady::TPtr& ev) { SRC_LOG_T("SessionId: " << GetSessionId() << " Source data ready"); SubscribedOnEvent = false; + if (ev.Get()->Cookie) { + Metrics.InFlySubscribe->Dec(); + } + Metrics.InFlyAsyncInputData->Set(1); + Metrics.AsyncInputDataRate->Inc(); Send(ComputeActorId, new TEvNewAsyncInputDataArrived(InputIndex)); } @@ -282,6 +313,7 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute } i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& buffer, TMaybe& watermark, bool&, i64 freeSpace) override { + Metrics.InFlyAsyncInputData->Set(0); SRC_LOG_T("SessionId: " << GetSessionId() << " GetAsyncInputData freeSpace = " << freeSpace); const auto now = TInstant::Now(); @@ -387,9 +419,10 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute void SubscribeOnNextEvent() { if (!SubscribedOnEvent) { SubscribedOnEvent = true; + Metrics.InFlySubscribe->Inc(); NActors::TActorSystem* actorSystem = NActors::TActivationContext::ActorSystem(); EventFuture = GetReadSession().WaitEvent().Subscribe([actorSystem, selfId = SelfId()](const auto&){ - actorSystem->Send(selfId, new TEvPrivate::TEvSourceDataReady()); + actorSystem->Send(selfId, new 
TEvPrivate::TEvSourceDataReady(), 0, 1); }); } } @@ -595,6 +628,7 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute const ui64 InputIndex; TDqAsyncStats IngressStats; const TTxId TxId; + TMetrics Metrics; const i64 BufferSize; const THolderFactory& HolderFactory; const TString LogPrefix; @@ -629,6 +663,7 @@ std::pair CreateDqPqReadActor( ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const NActors::TActorId& computeActorId, const NKikimr::NMiniKQL::THolderFactory& holderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters, i64 bufferSize ) { @@ -653,15 +688,16 @@ std::pair CreateDqPqReadActor( std::move(driver), CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, token, addBearerToToken), computeActorId, + counters, bufferSize ); return {actor, actor}; } -void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory) { +void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const ::NMonitoring::TDynamicCounterPtr& counters) { factory.RegisterSource("PqSource", - [driver = std::move(driver), credentialsFactory = std::move(credentialsFactory)]( + [driver = std::move(driver), credentialsFactory = std::move(credentialsFactory), counters]( NPq::NProto::TDqPqTopicSource&& settings, IDqAsyncIoFactory::TSourceArguments&& args) { @@ -678,6 +714,7 @@ void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driv credentialsFactory, args.ComputeActorId, args.HolderFactory, + counters, PQReadDefaultFreeSpace); }); diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h index ec6e4e169110..161e9e5eba57 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h @@ -34,9 
+34,10 @@ std::pair CreateDqPqReadActor( ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const NActors::TActorId& computeActorId, const NKikimr::NMiniKQL::THolderFactory& holderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters, i64 bufferSize = PQReadDefaultFreeSpace ); -void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory); +void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const ::NMonitoring::TDynamicCounterPtr& counters = MakeIntrusive<::NMonitoring::TDynamicCounters>()); } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp index 738ba1f6fb7c..5d56383c7475 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_decompressor_actor.cpp @@ -33,7 +33,9 @@ class TS3DecompressorCoroImpl : public TActorCoroImpl { private: bool nextImpl() final { while (!Coro->InputFinished || !Coro->Requests.empty()) { + Coro->CpuTime += Coro->GetCpuTimeDelta(); Coro->ProcessOneEvent(); + Coro->StartCycleCount = GetCycleCountFast(); if (Coro->InputBuffer) { RawDataBuffer.swap(Coro->InputBuffer); Coro->InputBuffer.clear(); @@ -65,6 +67,8 @@ class TS3DecompressorCoroImpl : public TActorCoroImpl { } void Run() final { + StartCycleCount = GetCycleCountFast(); + try { std::unique_ptr coroBuffer = std::make_unique(this); NDB::ReadBuffer* buffer = coroBuffer.get(); @@ -74,15 +78,15 @@ class TS3DecompressorCoroImpl : public TActorCoroImpl { decompressorBuffer->nextIfAtEnd(); TString data{decompressorBuffer->available(), ' '}; decompressorBuffer->read(&data.front(), decompressorBuffer->available()); - Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::move(data))); + 
Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::move(data), TakeCpuTimeDelta())); } } catch (const TDtorException&) { // Stop any activity instantly return; } catch (...) { - Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::current_exception())); + Send(Parent, new TEvS3Provider::TEvDecompressDataResult(std::current_exception(), TakeCpuTimeDelta())); } - Send(Parent, new TEvS3Provider::TEvDecompressDataFinish()); + Send(Parent, new TEvS3Provider::TEvDecompressDataFinish(TakeCpuTimeDelta())); } void ProcessOneEvent() { @@ -99,7 +103,19 @@ class TS3DecompressorCoroImpl : public TActorCoroImpl { InputBuffer = std::move(event.Data); } + TDuration GetCpuTimeDelta() { + return TDuration::Seconds(NHPTimer::GetSeconds(GetCycleCountFast() - StartCycleCount)); + } + + TDuration TakeCpuTimeDelta() { + auto currentCpuTime = CpuTime; + CpuTime = TDuration::Zero(); + return currentCpuTime; + } + private: + TDuration CpuTime; + ui64 StartCycleCount = 0; TString InputBuffer; TString Compression; TActorId Parent; diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 6c37dcad4a47..de1fcc7a1b4d 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -941,10 +941,13 @@ class TS3ReadCoroImpl : public TActorCoroImpl { } void Handle(TEvS3Provider::TEvDecompressDataResult::TPtr& ev) { + CpuTime += ev->Get()->CpuTime; DeferredDecompressedDataParts.push(std::move(ev->Release())); + } - void Handle(TEvS3Provider::TEvDecompressDataFinish::TPtr&) { + void Handle(TEvS3Provider::TEvDecompressDataFinish::TPtr& ev) { + CpuTime += ev->Get()->CpuTime; DecompressedInputFinished = true; } diff --git a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp index c925f788d4e1..3b14babb1dc3 100644 --- 
a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp +++ b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp @@ -59,4 +59,30 @@ namespace NYql::NDq { std::shared_ptr CreateDefaultS3ActorsFactory() { return std::make_shared(); } + + TS3ReadActorFactoryConfig CreateReadActorFactoryConfig(const ::NYql::TS3GatewayConfig& s3Config) { + TS3ReadActorFactoryConfig s3ReadActoryConfig; + if (const ui64 rowsInBatch = s3Config.GetRowsInBatch()) { + s3ReadActoryConfig.RowsInBatch = rowsInBatch; + } + if (const ui64 maxInflight = s3Config.GetMaxInflight()) { + s3ReadActoryConfig.MaxInflight = maxInflight; + } + if (const ui64 dataInflight = s3Config.GetDataInflight()) { + s3ReadActoryConfig.DataInflight = dataInflight; + } + for (auto& formatSizeLimit: s3Config.GetFormatSizeLimit()) { + if (formatSizeLimit.GetName()) { // ignore unnamed limits + s3ReadActoryConfig.FormatSizeLimits.emplace( + formatSizeLimit.GetName(), formatSizeLimit.GetFileSizeLimit()); + } + } + if (s3Config.HasFileSizeLimit()) { + s3ReadActoryConfig.FileSizeLimit = s3Config.GetFileSizeLimit(); + } + if (s3Config.HasBlockFileSizeLimit()) { + s3ReadActoryConfig.BlockFileSizeLimit = s3Config.GetBlockFileSizeLimit(); + } + return s3ReadActoryConfig; + } } diff --git a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h index 7bb632569075..a7c55e50348b 100644 --- a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h +++ b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h @@ -53,4 +53,5 @@ namespace NYql::NDq { std::shared_ptr CreateDefaultS3ActorsFactory(); + TS3ReadActorFactoryConfig CreateReadActorFactoryConfig(const ::NYql::TS3GatewayConfig& s3Config); } diff --git a/ydb/library/yql/providers/s3/events/events.h b/ydb/library/yql/providers/s3/events/events.h index b6baf5133650..8101d54deb0d 100644 --- a/ydb/library/yql/providers/s3/events/events.h +++ 
b/ydb/library/yql/providers/s3/events/events.h @@ -207,13 +207,27 @@ struct TEvS3Provider { }; struct TEvDecompressDataResult : public NActors::TEventLocal { - TEvDecompressDataResult(TString&& data) : Data(std::move(data)) {} - TEvDecompressDataResult(std::exception_ptr exception) : Exception(exception) {} + TEvDecompressDataResult(TString&& data, const TDuration& cpuTime) + : Data(std::move(data)) + , CpuTime(cpuTime) + {} + + TEvDecompressDataResult(std::exception_ptr exception, const TDuration& cpuTime) + : Exception(exception) + , CpuTime(cpuTime) + {} + TString Data; std::exception_ptr Exception; + TDuration CpuTime; }; struct TEvDecompressDataFinish : public NActors::TEventLocal { + TEvDecompressDataFinish(const TDuration& cpuTime) + : CpuTime(cpuTime) + {} + + TDuration CpuTime; }; struct TReadRange { diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index 47bcdf156cc5..7a16fc818aa4 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -481,7 +481,7 @@ class TS3DqIntegration: public TDqIntegrationBase { paths.clear(); ReadPathsList(srcDesc, {}, serialized, paths); - NDq::TS3ReadActorFactoryConfig readActorConfig; + const NDq::TS3ReadActorFactoryConfig& readActorConfig = State_->Configuration->S3ReadActorFactoryConfig; ui64 fileSizeLimit = readActorConfig.FileSizeLimit; if (srcDesc.HasFormat()) { if (auto it = readActorConfig.FormatSizeLimits.find(srcDesc.GetFormat()); it != readActorConfig.FormatSizeLimits.end()) { diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp index cace7a62e1fc..f01a1267ca13 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_settings.cpp @@ -46,6 +46,7 @@ void TS3Configuration::Init(const 
TS3GatewayConfig& config, TIntrusivePtr #include +#include namespace NYql { @@ -69,6 +70,7 @@ struct TS3Configuration : public TS3Settings, public NCommon::TSettingDispatcher bool WriteThroughDqIntegration = false; ui64 MaxListingResultSizePerPhysicalPartition; bool AllowAtomicUploadCommit = true; + NYql::NDq::TS3ReadActorFactoryConfig S3ReadActorFactoryConfig; }; } // NYql diff --git a/ydb/tests/fq/pq_async_io/ut_helpers.cpp b/ydb/tests/fq/pq_async_io/ut_helpers.cpp index bbb15edd85f9..907638e9fa1d 100644 --- a/ydb/tests/fq/pq_async_io/ut_helpers.cpp +++ b/ydb/tests/fq/pq_async_io/ut_helpers.cpp @@ -83,6 +83,7 @@ void TPqIoTestFixture::InitSource( nullptr, actor.SelfId(), actor.GetHolderFactory(), + MakeIntrusive(), freeSpace); actor.InitAsyncInput(dqSource, dqSourceAsActor); diff --git a/ydb/tests/fq/s3/conftest.py b/ydb/tests/fq/s3/conftest.py index 786ca79343aa..12eeea711880 100644 --- a/ydb/tests/fq/s3/conftest.py +++ b/ydb/tests/fq/s3/conftest.py @@ -35,9 +35,10 @@ def __init__(self, tests_count_limit, error_string): def on_test_start(self): self.number_tests += 1 - assert self.number_tests <= self.tests_count_limit, \ - f"{self.error_string} exceeded limit {self.number_tests} vs {self.tests_count_limit}, " \ + assert self.number_tests <= self.tests_count_limit, ( + f"{self.error_string} exceeded limit {self.number_tests} vs {self.tests_count_limit}, " "this may lead timeouts on CI, please split this file" + ) @pytest.fixture(scope="module") @@ -104,7 +105,9 @@ def kikimr_starts_counter(): @pytest.fixture(scope="module") -def kikimr_yqv1(kikimr_params: pytest.FixtureRequest, s3: S3, kikimr_settings, mvp_external_ydb_endpoint, kikimr_starts_counter): +def kikimr_yqv1( + kikimr_params: pytest.FixtureRequest, s3: S3, kikimr_settings, mvp_external_ydb_endpoint, kikimr_starts_counter +): kikimr_starts_counter.on_test_start() kikimr_extensions = get_kikimr_extensions(s3, YQV1_VERSION_NAME, kikimr_settings, mvp_external_ydb_endpoint) with start_kikimr(kikimr_params, 
kikimr_extensions) as kikimr: @@ -112,7 +115,9 @@ def kikimr_yqv1(kikimr_params: pytest.FixtureRequest, s3: S3, kikimr_settings, m @pytest.fixture(scope="module") -def kikimr_yqv2(kikimr_params: pytest.FixtureRequest, s3: S3, kikimr_settings, mvp_external_ydb_endpoint, kikimr_starts_counter): +def kikimr_yqv2( + kikimr_params: pytest.FixtureRequest, s3: S3, kikimr_settings, mvp_external_ydb_endpoint, kikimr_starts_counter +): kikimr_starts_counter.on_test_start() kikimr_extensions = get_kikimr_extensions(s3, YQV2_VERSION_NAME, kikimr_settings, mvp_external_ydb_endpoint) with start_kikimr(kikimr_params, kikimr_extensions) as kikimr: diff --git a/ydb/tests/fq/s3/test_format_setting.py b/ydb/tests/fq/s3/test_format_setting.py index 45e8bc5a6c20..5ef18c267042 100644 --- a/ydb/tests/fq/s3/test_format_setting.py +++ b/ydb/tests/fq/s3/test_format_setting.py @@ -744,9 +744,11 @@ def test_date_time_simple_posix_big_file( connection_id=connection_response.result.connection_id, columns=[a, b], format_setting={ - "data.datetime.format" - if format_name != "ISO" and format_name != "POSIX" - else "data.datetime.format_name": format_name + ( + "data.datetime.format" + if format_name != "ISO" and format_name != "POSIX" + else "data.datetime.format_name" + ): format_name }, ) diff --git a/ydb/tests/fq/s3/test_ydb_over_fq.py b/ydb/tests/fq/s3/test_ydb_over_fq.py index 155fce37990b..571b352acc42 100644 --- a/ydb/tests/fq/s3/test_ydb_over_fq.py +++ b/ydb/tests/fq/s3/test_ydb_over_fq.py @@ -362,3 +362,31 @@ def test_describe_table(self, kikimr, s3, client, unique_prefix): assert column.type == ydb.PrimitiveType.Int32 else: assert False + + @yq_all + @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True) + def test_insert_data_query(self, kikimr, s3, client, unique_prefix, yq_version): + kikimr.control_plane.wait_bootstrap() + connection_id = client.create_storage_connection(unique_prefix + "fruitbucket", "fbucket").result.connection_id + bind_name = 
unique_prefix + "fruits_bind" + self.make_binding( + client, + bind_name, + "/sub/", + connection_id, + [("Fruit", "STRING"), ("Price", "INT32"), ("Weight", "INT32")], + ) + driver = self.make_yq_driver(kikimr.endpoint(), client.folder_id, "root@builtin") + session = driver.table_client.session().create() + with session.transaction() as tx: + query = ''' + insert into {}{} + select + 'Banana' as `Fruit`, + 3 as Price, + 100 as Weight + '''.format( + "bindings." if yq_version == "v1" else "", bind_name + ) + result = tx.execute(query) + assert len(result) == 0, str(result) diff --git a/ydb/tools/query_replay/query_compiler.cpp b/ydb/tools/query_replay/query_compiler.cpp index 251ee0c49002..53d9f816a623 100644 --- a/ydb/tools/query_replay/query_compiler.cpp +++ b/ydb/tools/query_replay/query_compiler.cpp @@ -290,7 +290,7 @@ class TReplayCompileActor: public TActorBootstrapped { Gateway = CreateKikimrIcGateway(Query->Cluster, NKikimrKqp::QUERY_TYPE_SQL_GENERIC_QUERY, Query->Database, std::move(loader), TlsActivationContext->ExecutorThread.ActorSystem, SelfId().NodeId(), counters); - auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr}); + auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); KqpHost = CreateKqpHost(Gateway, Query->Cluster, Query->Database, Config, ModuleResolverState->ModuleResolver, federatedQuerySetup, nullptr, GUCSettings, Nothing(), FunctionRegistry, false); diff --git a/ydb/tools/query_replay_yt/query_compiler.cpp b/ydb/tools/query_replay_yt/query_compiler.cpp index b5d69560a834..d354c69bec91 100644 --- a/ydb/tools/query_replay_yt/query_compiler.cpp +++ b/ydb/tools/query_replay_yt/query_compiler.cpp @@ -594,7 +594,7 @@ class TReplayCompileActor: public TActorBootstrapped { Gateway = CreateKikimrIcGateway(Query->Cluster, NKikimrKqp::QUERY_TYPE_SQL_GENERIC_QUERY, Query->Database, 
std::move(loader), TlsActivationContext->ExecutorThread.ActorSystem, SelfId().NodeId(), counters); - auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr}); + auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); KqpHost = CreateKqpHost(Gateway, Query->Cluster, Query->Database, Config, ModuleResolverState->ModuleResolver, federatedQuerySetup, nullptr, GUCSettings, Nothing(), FunctionRegistry, false); From c7dd98ecd3d45535b6bd83d5ffa99b9d35a70e73 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 7 Aug 2024 14:33:11 +0300 Subject: [PATCH 07/56] YDB-2568 Enable match_recognize in ydb / q-stable-2024-07-08 (#7488) --- .../kqp/compile_service/kqp_compile_actor.cpp | 2 +- .../kqp/executer_actor/ut/kqp_executer_ut.cpp | 3 +- ydb/core/kqp/host/kqp_host.cpp | 13 ++- ydb/core/kqp/host/kqp_host.h | 2 +- ydb/core/kqp/opt/logical/kqp_opt_log.cpp | 10 +++ .../kqp/session_actor/kqp_worker_actor.cpp | 2 +- ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp | 2 +- ydb/core/kqp/ut/yql/kqp_pragma_ut.cpp | 87 +++++++++++++++++++ ydb/core/protos/config.proto | 1 + .../yql/core/yql_opt_match_recognize.cpp | 42 +++------ .../sql/dq_file/part16/canondata/result.json | 6 +- .../sql/dq_file/part17/canondata/result.json | 6 +- .../sql/dq_file/part19/canondata/result.json | 22 +++++ .../extracted | 5 ++ .../hybrid_file/part0/canondata/result.json | 6 +- .../hybrid_file/part3/canondata/result.json | 6 +- .../hybrid_file/part9/canondata/result.json | 14 +++ .../tests/sql/sql2yql/canondata/result.json | 26 ++++-- .../match_recognize/alerts_without_order.sql | 59 +++++++++++++ .../sql/suites/match_recognize/test_type.sql | 5 ++ .../part16/canondata/result.json | 6 +- .../part17/canondata/result.json | 6 +- .../part19/canondata/result.json | 21 +++++ .../extracted | 5 ++ .../yql/tools/dqrun/examples/gateways.conf | 3 + 
ydb/tests/fq/yt/cfg/kqprun_config.conf | 1 + ydb/tests/fq/yt/kqp_yt_file.py | 1 - .../kqprun/configuration/app_config.conf | 1 + ydb/tools/query_replay/query_compiler.cpp | 2 +- ydb/tools/query_replay_yt/query_compiler.cpp | 2 +- 30 files changed, 300 insertions(+), 67 deletions(-) create mode 100644 ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted create mode 100644 ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql create mode 100644 ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted diff --git a/ydb/core/kqp/compile_service/kqp_compile_actor.cpp b/ydb/core/kqp/compile_service/kqp_compile_actor.cpp index 776d2c534583..c04e0fc00d53 100644 --- a/ydb/core/kqp/compile_service/kqp_compile_actor.cpp +++ b/ydb/core/kqp/compile_service/kqp_compile_actor.cpp @@ -275,7 +275,7 @@ class TKqpCompileActor : public TActorBootstrapped { Config->FeatureFlags = AppData(ctx)->FeatureFlags; KqpHost = CreateKqpHost(Gateway, QueryId.Cluster, QueryId.Database, Config, ModuleResolverState->ModuleResolver, - FederatedQuerySetup, UserToken, GUCSettings, ApplicationName, AppData(ctx)->FunctionRegistry, + FederatedQuerySetup, UserToken, GUCSettings, QueryServiceConfig, ApplicationName, AppData(ctx)->FunctionRegistry, false, false, std::move(TempTablesState), nullptr, SplitCtx); IKqpHost::TPrepareSettings prepareSettings; diff --git a/ydb/core/kqp/executer_actor/ut/kqp_executer_ut.cpp b/ydb/core/kqp/executer_actor/ut/kqp_executer_ut.cpp index 4889ee332b27..b358f0efcf1c 100644 --- a/ydb/core/kqp/executer_actor/ut/kqp_executer_ut.cpp +++ b/ydb/core/kqp/executer_actor/ut/kqp_executer_ut.cpp @@ -8,6 +8,7 @@ #include #include +#include namespace NKikimr { namespace NKqp { @@ -28,7 +29,7 @@ NKqpProto::TKqpPhyTx BuildTxPlan(const TString& sql, TIntrusivePtr IModuleResolver::TPtr moduleResolver; UNIT_ASSERT(GetYqlDefaultModuleResolver(moduleCtx, 
moduleResolver)); - auto qp = CreateKqpHost(gateway, cluster, "/Root", config, moduleResolver, NYql::IHTTPGateway::Make(), nullptr, nullptr, Nothing(), nullptr, nullptr, false, false, nullptr, actorSystem); + auto qp = CreateKqpHost(gateway, cluster, "/Root", config, moduleResolver, NYql::IHTTPGateway::Make(), nullptr, nullptr, NKikimrConfig::TQueryServiceConfig(), Nothing(), nullptr, nullptr, false, false, nullptr, actorSystem, nullptr); auto result = qp->SyncPrepareDataQuery(sql, IKqpHost::TPrepareSettings()); result.Issues().PrintTo(Cerr); UNIT_ASSERT(result.Success()); diff --git a/ydb/core/kqp/host/kqp_host.cpp b/ydb/core/kqp/host/kqp_host.cpp index 01c8b51843af..972689af9f48 100644 --- a/ydb/core/kqp/host/kqp_host.cpp +++ b/ydb/core/kqp/host/kqp_host.cpp @@ -1033,7 +1033,7 @@ class TKqpHost : public IKqpHost { std::optional federatedQuerySetup, const TIntrusiveConstPtr& userToken, const NKikimr::NMiniKQL::IFunctionRegistry* funcRegistry, bool keepConfigChanges, bool isInternalCall, TKqpTempTablesState::TConstPtr tempTablesState = nullptr, NActors::TActorSystem* actorSystem = nullptr, - NYql::TExprContext* ctx = nullptr) + NYql::TExprContext* ctx = nullptr, const NKikimrConfig::TQueryServiceConfig& queryServiceConfig = NKikimrConfig::TQueryServiceConfig()) : Gateway(gateway) , Cluster(cluster) , GUCSettings(gUCSettings) @@ -1051,6 +1051,7 @@ class TKqpHost : public IKqpHost { , FakeWorld(ctx ? nullptr : ExprCtx->NewWorld(TPosition())) , ExecuteCtx(MakeIntrusive()) , ActorSystem(actorSystem ? 
actorSystem : NActors::TActivationContext::ActorSystem()) + , QueryServiceConfig(queryServiceConfig) { if (funcRegistry) { FuncRegistry = funcRegistry; @@ -1825,10 +1826,15 @@ class TKqpHost : public IKqpHost { || settingName == "FilterPushdownOverJoinOptionalSide" || settingName == "DisableFilterPushdownOverJoinOptionalSide" || settingName == "RotateJoinTree" + || settingName == "TimeOrderRecoverDelay" + || settingName == "TimeOrderRecoverAhead" + || settingName == "TimeOrderRecoverRowLimit" + || settingName == "MatchRecognizeStream" ; }; auto configProvider = CreateConfigProvider(*TypesCtx, gatewaysConfig, {}, allowSettings); TypesCtx->AddDataSource(ConfigProviderName, configProvider); + TypesCtx->MatchRecognize = QueryServiceConfig.GetEnableMatchRecognize(); YQL_ENSURE(TypesCtx->Initialize(*ExprCtx)); @@ -1930,6 +1936,7 @@ class TKqpHost : public IKqpHost { TKqpTempTablesState::TConstPtr TempTablesState; NActors::TActorSystem* ActorSystem = nullptr; + NKikimrConfig::TQueryServiceConfig QueryServiceConfig; }; } // namespace @@ -1950,11 +1957,11 @@ Ydb::Table::QueryStatsCollection::Mode GetStatsMode(NYql::EKikimrStatsMode stats TIntrusivePtr CreateKqpHost(TIntrusivePtr gateway, const TString& cluster, const TString& database, TKikimrConfiguration::TPtr config, IModuleResolver::TPtr moduleResolver, std::optional federatedQuerySetup, const TIntrusiveConstPtr& userToken, const TGUCSettings::TPtr& gUCSettings, - const TMaybe& applicationName, const NKikimr::NMiniKQL::IFunctionRegistry* funcRegistry, bool keepConfigChanges, + const NKikimrConfig::TQueryServiceConfig& queryServiceConfig, const TMaybe& applicationName, const NKikimr::NMiniKQL::IFunctionRegistry* funcRegistry, bool keepConfigChanges, bool isInternalCall, TKqpTempTablesState::TConstPtr tempTablesState, NActors::TActorSystem* actorSystem, NYql::TExprContext* ctx) { return MakeIntrusive(gateway, cluster, database, gUCSettings, applicationName, config, moduleResolver, federatedQuerySetup, userToken, 
funcRegistry, - keepConfigChanges, isInternalCall, std::move(tempTablesState), actorSystem, ctx); + keepConfigChanges, isInternalCall, std::move(tempTablesState), actorSystem, ctx, queryServiceConfig); } } // namespace NKqp diff --git a/ydb/core/kqp/host/kqp_host.h b/ydb/core/kqp/host/kqp_host.h index ff94738619f0..85a7025a9e1b 100644 --- a/ydb/core/kqp/host/kqp_host.h +++ b/ydb/core/kqp/host/kqp_host.h @@ -120,7 +120,7 @@ class IKqpHost : public TThrRefBase { TIntrusivePtr CreateKqpHost(TIntrusivePtr gateway, const TString& cluster, const TString& database, NYql::TKikimrConfiguration::TPtr config, NYql::IModuleResolver::TPtr moduleResolver, std::optional federatedQuerySetup, const TIntrusiveConstPtr& userToken, const TGUCSettings::TPtr& gUCSettings, - const TMaybe& applicationName = Nothing(), const NKikimr::NMiniKQL::IFunctionRegistry* funcRegistry = nullptr, + const NKikimrConfig::TQueryServiceConfig& queryServiceConfig, const TMaybe& applicationName = Nothing(), const NKikimr::NMiniKQL::IFunctionRegistry* funcRegistry = nullptr, bool keepConfigChanges = false, bool isInternalCall = false, TKqpTempTablesState::TConstPtr tempTablesState = nullptr, NActors::TActorSystem* actorSystem = nullptr /*take from TLS by default*/, NYql::TExprContext* ctx = nullptr); diff --git a/ydb/core/kqp/opt/logical/kqp_opt_log.cpp b/ydb/core/kqp/opt/logical/kqp_opt_log.cpp index 2ea6edf076b5..7dfea31d5417 100644 --- a/ydb/core/kqp/opt/logical/kqp_opt_log.cpp +++ b/ydb/core/kqp/opt/logical/kqp_opt_log.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -61,6 +62,7 @@ class TKqpLogicalOptTransformer : public TOptimizeTransformerBase { AddHandler(0, &TCoNarrowFlatMap::Match, HNDL(DqReadWideWrapFieldSubset)); AddHandler(0, &TCoNarrowMultiMap::Match, HNDL(DqReadWideWrapFieldSubset)); AddHandler(0, &TCoWideMap::Match, HNDL(DqReadWideWrapFieldSubset)); + AddHandler(0, &TCoMatchRecognize::Match, HNDL(MatchRecognize)); AddHandler(1, &TCoFlatMap::Match, 
HNDL(LatePushExtractedPredicateToReadTable)); AddHandler(1, &TCoTop::Match, HNDL(RewriteTopSortOverIndexRead)); @@ -310,6 +312,14 @@ class TKqpLogicalOptTransformer : public TOptimizeTransformerBase { return output; } + TMaybeNode MatchRecognize(TExprBase node, TExprContext& ctx) { + auto output = ExpandMatchRecognize(node.Ptr(), ctx, TypesCtx); + if (output) { + DumpAppliedRule("MatchRecognize", node.Ptr(), output, ctx); + } + return output; + } + TMaybeNode DqReadWrapByProvider(TExprBase node, TExprContext& ctx) { auto output = NDq::DqReadWrapByProvider(node, ctx, TypesCtx); if (output) { diff --git a/ydb/core/kqp/session_actor/kqp_worker_actor.cpp b/ydb/core/kqp/session_actor/kqp_worker_actor.cpp index 870ffa599ff7..024a094d0e4d 100644 --- a/ydb/core/kqp/session_actor/kqp_worker_actor.cpp +++ b/ydb/core/kqp/session_actor/kqp_worker_actor.cpp @@ -188,7 +188,7 @@ class TKqpWorkerActor : public TActorBootstrapped { Config->FeatureFlags = AppData(ctx)->FeatureFlags; KqpHost = CreateKqpHost(Gateway, Settings.Cluster, Settings.Database, Config, ModuleResolverState->ModuleResolver, FederatedQuerySetup, - QueryState->RequestEv->GetUserToken(), GUCSettings, Settings.ApplicationName, AppData(ctx)->FunctionRegistry, !Settings.LongSession, false); + QueryState->RequestEv->GetUserToken(), GUCSettings, QueryServiceConfig, Settings.ApplicationName, AppData(ctx)->FunctionRegistry, !Settings.LongSession, false, nullptr, nullptr, nullptr); auto& queryRequest = QueryState->RequestEv; QueryState->ProxyRequestId = proxyRequestId; diff --git a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp index 4a349c0e5227..6f334a186107 100644 --- a/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp +++ b/ydb/core/kqp/ut/indexes/kqp_indexes_ut.cpp @@ -55,7 +55,7 @@ TIntrusivePtr CreateKikimrQueryProcessor(TIntrusivePtr ga auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); return 
NKqp::CreateKqpHost(gateway, cluster, "/Root", kikimrConfig, moduleResolver, - federatedQuerySetup, nullptr, nullptr, {}, funcRegistry, funcRegistry, keepConfigChanges, nullptr, actorSystem); + federatedQuerySetup, nullptr, nullptr, NKikimrConfig::TQueryServiceConfig(), {}, funcRegistry, funcRegistry, keepConfigChanges, nullptr, actorSystem, nullptr); } NYql::NNodes::TExprBase GetExpr(const TString& ast, NYql::TExprContext& ctx, NYql::IModuleResolver* moduleResolver) { diff --git a/ydb/core/kqp/ut/yql/kqp_pragma_ut.cpp b/ydb/core/kqp/ut/yql/kqp_pragma_ut.cpp index 8cd9b54a6857..edf5740b7ef2 100644 --- a/ydb/core/kqp/ut/yql/kqp_pragma_ut.cpp +++ b/ydb/core/kqp/ut/yql/kqp_pragma_ut.cpp @@ -84,6 +84,93 @@ Y_UNIT_TEST_SUITE(KqpPragma) { UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); UNIT_ASSERT_C(result.GetIssues().Empty(), result.GetIssues().ToString()); } + + Y_UNIT_TEST(MatchRecognizeWithTimeOrderRecoverer) { + TKikimrSettings settings; + NKikimrConfig::TAppConfig appConfig; + appConfig.MutableQueryServiceConfig()->SetEnableMatchRecognize(true); + settings.SetAppConfig(appConfig); + + TKikimrRunner kikimr(settings); + NYdb::NScripting::TScriptingClient client(kikimr.GetDriver()); + + auto result = client.ExecuteYqlScript(R"( + PRAGMA FeatureR010="prototype"; + + CREATE TABLE `/Root/NewTable` ( + dt Uint64, + value String, + PRIMARY KEY (dt) + ); + COMMIT; + + INSERT INTO `/Root/NewTable` (dt, value) VALUES + (1, 'value1'), (2, 'value2'), (3, 'value3'), (4, 'value4'); + COMMIT; + + SELECT * FROM (SELECT dt, value FROM `/Root/NewTable`) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(V1.dt) as v1, + LAST(V4.dt) as v4 + ONE ROW PER MATCH + PATTERN (V1 V* V4) + DEFINE + V1 as V1.value = "value1", + V as True, + V4 as V4.value = "value4" + ); + )").GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); + CompareYson(R"([ + [[1u];[4u]]; + 
])", FormatResultSetYson(result.GetResultSet(0))); + } + + Y_UNIT_TEST(MatchRecognizeWithoutTimeOrderRecoverer) { + TKikimrSettings settings; + NKikimrConfig::TAppConfig appConfig; + appConfig.MutableQueryServiceConfig()->SetEnableMatchRecognize(true); + settings.SetAppConfig(appConfig); + + TKikimrRunner kikimr(settings); + NYdb::NScripting::TScriptingClient client(kikimr.GetDriver()); + + auto result = client.ExecuteYqlScript(R"( + PRAGMA FeatureR010="prototype"; + PRAGMA config.flags("MatchRecognizeStream", "disable"); + + CREATE TABLE `/Root/NewTable` ( + dt Uint64, + value String, + PRIMARY KEY (dt) + ); + COMMIT; + + INSERT INTO `/Root/NewTable` (dt, value) VALUES + (1, 'value1'), (2, 'value2'), (3, 'value3'), (4, 'value4'); + COMMIT; + + SELECT * FROM (SELECT dt, value FROM `/Root/NewTable`) + MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) + MEASURES + LAST(V1.dt) as v1, + LAST(V4.dt) as v4 + ONE ROW PER MATCH + PATTERN (V1 V* V4) + DEFINE + V1 as V1.value = "value1", + V as True, + V4 as V4.value = "value4" + ); + )").GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(result.GetStatus(), EStatus::SUCCESS, result.GetIssues().ToString()); + CompareYson(R"([ + [[1u];[4u]]; + ])", FormatResultSetYson(result.GetResultSet(0))); + } } } // namspace NKqp diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index e5ced35b5a23..b929e0f1bfa1 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1014,6 +1014,7 @@ message TQueryServiceConfig { optional NYql.TGenericGatewayConfig Generic = 11; optional TFinalizeScriptServiceConfig FinalizeScriptServiceConfig = 12; optional uint64 ProgressStatsPeriodMs = 14 [default = 0]; // 0 = disabled + optional bool EnableMatchRecognize = 20 [default = false]; } // Config describes immediate controls and allows diff --git a/ydb/library/yql/core/yql_opt_match_recognize.cpp b/ydb/library/yql/core/yql_opt_match_recognize.cpp index 5e99874b5415..b7f05afa37ab 100644 --- 
a/ydb/library/yql/core/yql_opt_match_recognize.cpp +++ b/ydb/library/yql/core/yql_opt_match_recognize.cpp @@ -146,7 +146,7 @@ TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& ExtractSortKeyAndOrder(pos, sortTraits, sortKey, sortOrder, ctx); TExprNode::TPtr result; if (isStreaming) { - YQL_ENSURE(sortOrder->ChildrenSize() == 1, "Expect ORDER BY timestamp for MATCH_RECOGNIZE on streams"); + YQL_ENSURE(sortOrder->ChildrenSize() == 1, "Expect ORDER BY timestamp for MATCH_RECOGNIZE"); const auto reordered = ctx.Builder(pos) .Lambda() .Param("partition") @@ -216,37 +216,15 @@ TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& .Seal() .Build(); } else { //non-streaming - if (partitionColumns->ChildrenSize() != 0) { - result = ctx.Builder(pos) - .Callable("PartitionsByKeys") - .Add(0, input) - .Add(1, partitionKeySelector) - .Add(2, sortOrder) - .Add(3, sortKey) - .Add(4, matchRecognize) - .Seal() - .Build(); - } else { - if (sortOrder->IsCallable("Void")) { - result = ctx.Builder(pos) - .Apply(matchRecognize) - .With(0, input) - .Seal() - .Build();; - } else { - result = ctx.Builder(pos) - .Apply(matchRecognize) - .With(0) - .Callable("Sort") - .Add(0, input) - .Add(1, sortOrder) - .Add(2, sortKey) - .Seal() - .Done() - .Seal() - .Build(); - } - } + result = ctx.Builder(pos) + .Callable("PartitionsByKeys") + .Add(0, input) + .Add(1, partitionKeySelector) + .Add(2, sortOrder) + .Add(3, sortKey) + .Add(4, matchRecognize) + .Seal() + .Build(); } YQL_CLOG(INFO, Core) << "Expanded MatchRecognize"; return result; diff --git a/ydb/library/yql/tests/sql/dq_file/part16/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part16/canondata/result.json index 11b4b9d9354f..683010ea4763 100644 --- a/ydb/library/yql/tests/sql/dq_file/part16/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part16/canondata/result.json @@ -1892,9 +1892,9 @@ ], "test.test[match_recognize-test_type-default.txt-Debug]": [ { - 
"checksum": "cb5512aae3f5566055b2388be6d114af", - "size": 3220, - "uri": "https://{canondata_backend}/1937367/518bbcf510ad7a43c5e77746bafd21ed0e3fdc6e/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql_patched" + "checksum": "648119cc488bae598a0936f9d2c82b7e", + "size": 3458, + "uri": "https://{canondata_backend}/1942173/c4d7dbc720e57397caf847cd2616b1362110ddd2/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql_patched" } ], "test.test[match_recognize-test_type-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json index 753c3b460808..7bcfcf2f5fc9 100644 --- a/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json @@ -1717,9 +1717,9 @@ ], "test.test[match_recognize-alerts-default.txt-Debug]": [ { - "checksum": "782bb90b80a43308dfef1dbd81055b12", - "size": 5618, - "uri": "https://{canondata_backend}/1942173/e32f1de19c4f2770a6f215d1dc22bc97e318bf22/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql_patched" + "checksum": "c8b1e13d6da573f8a1afd415db1d00e7", + "size": 5787, + "uri": "https://{canondata_backend}/1917492/86ab0de654a60bf1e3145a3d8e3d7eae4a9f26b8/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql_patched" } ], "test.test[match_recognize-alerts-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/dq_file/part19/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part19/canondata/result.json index 6d11e5118b6c..61a7443bb7eb 100644 --- a/ydb/library/yql/tests/sql/dq_file/part19/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part19/canondata/result.json @@ -1784,6 +1784,28 @@ } ], "test.test[limit-empty_input_after_limit-default.txt-Results]": [], + "test.test[match_recognize-alerts_without_order-default.txt-Analyze]": [ + { + "checksum": 
"b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1917492/ef839f70e5a2f493427f7f92ed00d26a993f6d4a/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Analyze_/plan.txt" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Debug]": [ + { + "checksum": "17c5c1f84ac65b6a82234cd0b0a41a68", + "size": 5699, + "uri": "https://{canondata_backend}/1917492/ef839f70e5a2f493427f7f92ed00d26a993f6d4a/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1917492/ef839f70e5a2f493427f7f92ed00d26a993f6d4a/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Plan_/plan.txt" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Results]": [], "test.test[optimizers-unused_columns_group_one_of_multi--Analyze]": [ { "checksum": "ffcfe803a5b4bbfe9af72cc128197217", diff --git a/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted b/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted new file mode 100644 index 000000000000..abd564f4a8c8 --- /dev/null +++ b/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted @@ -0,0 +1,5 @@ +/program.sql:
: Fatal: Optimization + + /program.sql:
:8:1: Fatal: ydb/library/yql/core/yql_opt_match_recognize.cpp:xxx ExpandMatchRecognize(): requirement sortOrder->ChildrenSize() == 1 failed, message: Expect ORDER BY timestamp for MATCH_RECOGNIZE + select * from (select * from AS_TABLE($data) MATCH_RECOGNIZE( + ^ \ No newline at end of file diff --git a/ydb/library/yql/tests/sql/hybrid_file/part0/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part0/canondata/result.json index 037e6e59e5d6..3efd9dc0f833 100644 --- a/ydb/library/yql/tests/sql/hybrid_file/part0/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part0/canondata/result.json @@ -1653,9 +1653,9 @@ ], "test.test[match_recognize-alerts-default.txt-Debug]": [ { - "checksum": "900161f08e14b0b4c725130ed055ca73", - "size": 5617, - "uri": "https://{canondata_backend}/1880306/5d2fb97b23cd70975bc5d744391981f9d5595c04/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql_patched" + "checksum": "902f8b167c5875200480d237a6493bb7", + "size": 5786, + "uri": "https://{canondata_backend}/1903885/f00a3197fa44aa3d49bf7fe1bbf0fed52ce265b9/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql_patched" } ], "test.test[match_recognize-alerts-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/hybrid_file/part3/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part3/canondata/result.json index 523ccc4dbc3d..08304314316f 100644 --- a/ydb/library/yql/tests/sql/hybrid_file/part3/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part3/canondata/result.json @@ -1737,9 +1737,9 @@ ], "test.test[match_recognize-test_type-default.txt-Debug]": [ { - "checksum": "e0e549a969a71f8fddbd772e08bbeebe", - "size": 3219, - "uri": "https://{canondata_backend}/1937492/7ae37c32b42bb57d4df171a62ced7ab76867a8ea/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql_patched" + "checksum": "367551185530c7b04aa9da2f8afa111f", + "size": 3457, + "uri": 
"https://{canondata_backend}/1903885/a4d0122d8471ff0ca85352e617bed922d9ad8df1/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql_patched" } ], "test.test[match_recognize-test_type-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/hybrid_file/part9/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part9/canondata/result.json index 3b42ce6a3888..9a5584174dac 100644 --- a/ydb/library/yql/tests/sql/hybrid_file/part9/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part9/canondata/result.json @@ -1511,6 +1511,20 @@ "uri": "https://{canondata_backend}/1900335/8eba31ae2dcfd9245ad9327a1ac3ca89667336e2/resource.tar.gz#test.test_limit-empty_input_after_limit-default.txt-Plan_/plan.txt" } ], + "test.test[match_recognize-alerts_without_order-default.txt-Debug]": [ + { + "checksum": "acba759d95a9b70640e6418dc1febb2d", + "size": 5698, + "uri": "https://{canondata_backend}/1937424/f54290c1c9e8b8c01bdab19c1d6ef1f76de15d9c/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1937424/f54290c1c9e8b8c01bdab19c1d6ef1f76de15d9c/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Plan_/plan.txt" + } + ], "test.test[optimizers-direct_row_after_merge--Debug]": [ { "checksum": "6db94e68bc8d6ad4ae649044f1b0c9e9", diff --git a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json index 9241a755e714..ab33e315fbbf 100644 --- a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json +++ b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json @@ -10758,6 +10758,13 @@ "uri": "https://{canondata_backend}/1784117/d56ae82ad9d30397a41490647be1bd2124718f98/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_/sql.yql" 
} ], + "test_sql2yql.test[match_recognize-alerts_without_order]": [ + { + "checksum": "4a7d1c9ca704a076217e529b5489ad87", + "size": 8780, + "uri": "https://{canondata_backend}/1937001/f1ec239726ab3e2cf00695f3d10461ff9ef6c3b0/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_without_order_/sql.yql" + } + ], "test_sql2yql.test[match_recognize-permute]": [ { "checksum": "05c45a70d86bca34be996277afae8bf9", @@ -10788,9 +10795,9 @@ ], "test_sql2yql.test[match_recognize-test_type]": [ { - "checksum": "1b5581aa704781439ce64e9fc4e3c21d", - "size": 9654, - "uri": "https://{canondata_backend}/1784117/d56ae82ad9d30397a41490647be1bd2124718f98/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_/sql.yql" + "checksum": "0a5812e84f194b487eae4084027bd170", + "size": 10249, + "uri": "https://{canondata_backend}/1936842/c0fac16b134e7c8f865a197ac63738ced4fac271/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_/sql.yql" } ], "test_sql2yql.test[match_recognize-test_type_predicate]": [ @@ -29987,6 +29994,13 @@ "uri": "https://{canondata_backend}/1937001/da4215d5087e56eec0224ec5e7754dafd0b2bdcf/resource.tar.gz#test_sql_format.test_match_recognize-alerts_/formatted.sql" } ], + "test_sql_format.test[match_recognize-alerts_without_order]": [ + { + "checksum": "779c2c3a4eab619646509ce5008863e8", + "size": 2906, + "uri": "https://{canondata_backend}/1937001/f1ec239726ab3e2cf00695f3d10461ff9ef6c3b0/resource.tar.gz#test_sql_format.test_match_recognize-alerts_without_order_/formatted.sql" + } + ], "test_sql_format.test[match_recognize-permute]": [ { "checksum": "998e6752ce413cc78e952b9958dfab74", @@ -30017,9 +30031,9 @@ ], "test_sql_format.test[match_recognize-test_type]": [ { - "checksum": "3fcf6d53720604b982ad58beed055a26", - "size": 1127, - "uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_match_recognize-test_type_/formatted.sql" + "checksum": "36104b385f3b9986c22f409931b80564", + "size": 1302, + 
"uri": "https://{canondata_backend}/1936842/c0fac16b134e7c8f865a197ac63738ced4fac271/resource.tar.gz#test_sql_format.test_match_recognize-test_type_/formatted.sql" } ], "test_sql_format.test[match_recognize-test_type_predicate]": [ diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql b/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql new file mode 100644 index 000000000000..7d92f0f18c7b --- /dev/null +++ b/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql @@ -0,0 +1,59 @@ +$osquery_data = [ +<|dt:1688910000, host:"fqdn1", ev_type:"someEv", ev_status:"", user:"", vpn:false, |>, +<|dt:1688910050, host:"fqdn2", ev_type:"login", ev_status:"success", user:"", vpn:true, |>, +<|dt:1688910100, host:"fqdn1", ev_type:"login", ev_status:"success", user:"", vpn:true, |>, +<|dt:1688910220, host:"fqdn1", ev_type:"login", ev_status:"success", user:"", vpn:false, |>, +<|dt:1688910300, host:"fqdn1", ev_type:"delete_all", ev_status:"", user:"", vpn:false, |>, +<|dt:1688910400, host:"fqdn2", ev_type:"delete_all", ev_status:"", user:"", vpn:false, |>, +<|dt:1688910500, host:"fqdn1", ev_type:"login", ev_status:"failed", user:"user1", vpn:false, |>, +<|dt:1688910500, host:"fqdn1", ev_type:"login", ev_status:"failed", user:"user2", vpn:false, |>, +<|dt:1688910600, host:"fqdn", ev_type:"someEv", ev_status:"", user:"user1", vpn:false, |>, +<|dt:1688910800, host:"fqdn2", ev_type:"login", ev_status:"failed", user:"user1", vpn:false, |>, +<|dt:1688910900, host:"fqdn2", ev_type:"login", ev_status:"failed", user:"user2", vpn:false, |>, +<|dt:1688911000, host:"fqdn2", ev_type:"login", ev_status:"success", user:"user1", vpn:false, |>, +]; + +pragma FeatureR010="prototype"; +pragma config.flags("MatchRecognizeStream", "disable"); + +SELECT * +FROM AS_TABLE($osquery_data) MATCH_RECOGNIZE( + MEASURES + LAST(SUSPICIOUS_ACTION_SOON.dt) as suspicious_action_dt, + LAST(LOGIN_SUCCESS_REMOTE.host) as remote_login_host, + 
LAST(LOGIN_SUCCESS_REMOTE.user) as remote_login_user, + LAST(LOGIN_SUCCESS_REMOTE.dt) as t, + FIRST(LOGIN_FAILED_SAME_USER.dt) as brutforce_begin, + FIRST(LOGIN_SUCCESS_SAME_USER.dt) as brutforce_end, + LAST(LOGIN_SUCCESS_SAME_USER.user) as brutforce_login + + ONE ROW PER MATCH + PATTERN ( + LOGIN_SUCCESS_REMOTE ANY_ROW* (SUSPICIOUS_ACTION_SOON | SUSPICIOUS_ACTION_TIMEOUT) | + (LOGIN_FAILED_SAME_USER ANY_ROW*){2,} LOGIN_SUCCESS_SAME_USER + ) + DEFINE + LOGIN_SUCCESS_REMOTE as + LOGIN_SUCCESS_REMOTE.ev_type = "login" and + LOGIN_SUCCESS_REMOTE.ev_status = "success" and + LOGIN_SUCCESS_REMOTE.vpn = true, + SUSPICIOUS_ACTION_SOON as + SUSPICIOUS_ACTION_SOON.host = LAST(LOGIN_SUCCESS_REMOTE.host) and + SUSPICIOUS_ACTION_SOON.ev_type = "delete_all" and + SUSPICIOUS_ACTION_SOON.dt - LAST(LOGIN_SUCCESS_REMOTE.dt) < 1000, + SUSPICIOUS_ACTION_TIMEOUT as + SUSPICIOUS_ACTION_TIMEOUT.dt - LAST(LOGIN_SUCCESS_REMOTE.dt) >= 1000, + + LOGIN_FAILED_SAME_USER as + LOGIN_FAILED_SAME_USER.ev_type = "login" and + LOGIN_FAILED_SAME_USER.ev_status <> "success" and + (LAST(LOGIN_FAILED_SAME_USER.user) IS NULL + or LAST(LOGIN_FAILED_SAME_USER.user) = LOGIN_FAILED_SAME_USER.user + ), + LOGIN_SUCCESS_SAME_USER as + LOGIN_SUCCESS_SAME_USER.ev_type = "login" and + LOGIN_SUCCESS_SAME_USER.ev_status = "success" and + LOGIN_SUCCESS_SAME_USER.user = LAST(LOGIN_FAILED_SAME_USER.user) +) AS MATCHED +; + diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/test_type.sql b/ydb/library/yql/tests/sql/suites/match_recognize/test_type.sql index 1a5cdeaaf6bb..a92b5672e488 100644 --- a/ydb/library/yql/tests/sql/suites/match_recognize/test_type.sql +++ b/ydb/library/yql/tests/sql/suites/match_recognize/test_type.sql @@ -7,6 +7,7 @@ $data = [<|dt:4, host:"fqdn1", key:14|>]; -- NoPartitionNoMeasure select * from AS_TABLE($data) MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) ONE ROW PER MATCH AFTER MATCH SKIP TO NEXT ROW PATTERN ( @@ -18,6 +19,7 @@ select * from AS_TABLE($data) MATCH_RECOGNIZE( 
--NoPartitionStringMeasure select * from AS_TABLE($data) MATCH_RECOGNIZE( + ORDER BY CAST(dt as Timestamp) MEASURES "SomeString" as Measure1 ONE ROW PER MATCH @@ -32,6 +34,7 @@ select * from AS_TABLE($data) MATCH_RECOGNIZE( --IntPartitionColNoMeasure select * from AS_TABLE($data) MATCH_RECOGNIZE( PARTITION BY dt + ORDER BY CAST(dt as Timestamp) ONE ROW PER MATCH AFTER MATCH SKIP TO NEXT ROW PATTERN ( @@ -44,6 +47,7 @@ select * from AS_TABLE($data) MATCH_RECOGNIZE( --StringPartitionColStringMeasure select * from AS_TABLE($data) MATCH_RECOGNIZE( PARTITION BY host + ORDER BY CAST(dt as Timestamp) MEASURES "SomeString" as Measure1 ONE ROW PER MATCH @@ -58,6 +62,7 @@ select * from AS_TABLE($data) MATCH_RECOGNIZE( --TwoPartitionColsTwoMeasures select * from AS_TABLE($data) MATCH_RECOGNIZE( PARTITION BY host, dt + ORDER BY CAST(dt as Timestamp) MEASURES "SomeString" as S, 345 as I diff --git a/ydb/library/yql/tests/sql/yt_native_file/part16/canondata/result.json b/ydb/library/yql/tests/sql/yt_native_file/part16/canondata/result.json index 584d2446df30..15873b984acd 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part16/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part16/canondata/result.json @@ -1670,9 +1670,9 @@ ], "test.test[match_recognize-test_type-default.txt-Debug]": [ { - "checksum": "e7e3da81b0bcd6e16f054a55deeef34e", - "size": 3152, - "uri": "https://{canondata_backend}/1130705/85e00e1809c16d7a062c55ddef958687d825adb0/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql" + "checksum": "f3049326be52b62a5cf57002c3eebcf2", + "size": 3382, + "uri": "https://{canondata_backend}/1599023/f2c034a7162395b18a12a2a3caebebf38f158f60/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-test_type-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/yt_native_file/part17/canondata/result.json 
b/ydb/library/yql/tests/sql/yt_native_file/part17/canondata/result.json index fb88b6436fd8..32095e7958be 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part17/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part17/canondata/result.json @@ -1477,9 +1477,9 @@ ], "test.test[match_recognize-alerts-default.txt-Debug]": [ { - "checksum": "e6883aec19c55794eb3df952b116a0f7", - "size": 5540, - "uri": "https://{canondata_backend}/1880306/2a33b9c798cdb676ceda243cdc609d2afef27554/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql" + "checksum": "40334ece1e6991ad870f1d1488b88b0a", + "size": 5709, + "uri": "https://{canondata_backend}/1599023/94042bc6ec9d078689120650754efa466e6c1d00/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-alerts-default.txt-Plan]": [ diff --git a/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json b/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json index 7bff445dfd0f..2c743f411c5a 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json @@ -1632,6 +1632,27 @@ "uri": "https://{canondata_backend}/1942100/33d51fa00fd086c78b2c0087e9e9f2249eef2d76/resource.tar.gz#test.test_limit-empty_input_after_limit-default.txt-Results_/results.txt" } ], + "test.test[match_recognize-alerts_without_order-default.txt-Debug]": [ + { + "checksum": "46286b4ea128734b6e3ffae32164bc1e", + "size": 5622, + "uri": "https://{canondata_backend}/1130705/ac239a3807774cda911f256a24ef987fd0afe20f/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Debug_/opt.yql" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": 
"https://{canondata_backend}/1130705/ac239a3807774cda911f256a24ef987fd0afe20f/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Plan_/plan.txt" + } + ], + "test.test[match_recognize-alerts_without_order-default.txt-Results]": [ + { + "checksum": "6e11c24a571d7b78308343a8fe4d0772", + "size": 4611, + "uri": "https://{canondata_backend}/1130705/ac239a3807774cda911f256a24ef987fd0afe20f/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Results_/results.txt" + } + ], "test.test[optimizers-unused_columns_group_one_of_multi--Debug]": [ { "checksum": "52de67abfc854bb6cba714600c125055", diff --git a/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted b/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted new file mode 100644 index 000000000000..abd564f4a8c8 --- /dev/null +++ b/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted @@ -0,0 +1,5 @@ +/program.sql:
: Fatal: Optimization + + /program.sql:
:8:1: Fatal: ydb/library/yql/core/yql_opt_match_recognize.cpp:xxx ExpandMatchRecognize(): requirement sortOrder->ChildrenSize() == 1 failed, message: Expect ORDER BY timestamp for MATCH_RECOGNIZE + select * from (select * from AS_TABLE($data) MATCH_RECOGNIZE( + ^ \ No newline at end of file diff --git a/ydb/library/yql/tools/dqrun/examples/gateways.conf b/ydb/library/yql/tools/dqrun/examples/gateways.conf index e2d2217e7a21..e3c699301ed0 100644 --- a/ydb/library/yql/tools/dqrun/examples/gateways.conf +++ b/ydb/library/yql/tools/dqrun/examples/gateways.conf @@ -124,6 +124,9 @@ YqlCore { Flags { Name: "_EnableStreamLookupJoin" } + Flags { + Name: "_EnableMatchRecognize" + } } SqlCore { diff --git a/ydb/tests/fq/yt/cfg/kqprun_config.conf b/ydb/tests/fq/yt/cfg/kqprun_config.conf index 47e00870c153..7549887630c2 100644 --- a/ydb/tests/fq/yt/cfg/kqprun_config.conf +++ b/ydb/tests/fq/yt/cfg/kqprun_config.conf @@ -21,4 +21,5 @@ QueryServiceConfig { Value: "true" } } + EnableMatchRecognize: true } diff --git a/ydb/tests/fq/yt/kqp_yt_file.py b/ydb/tests/fq/yt/kqp_yt_file.py index 99bac78ba74f..1d4f87ac3d46 100644 --- a/ydb/tests/fq/yt/kqp_yt_file.py +++ b/ydb/tests/fq/yt/kqp_yt_file.py @@ -6,7 +6,6 @@ from yql_utils import KSV_ATTR, get_files, get_http_files, get_tables, is_xfail, yql_binary_path EXCLUDED_SUITES = [ - 'match_recognize', # MATCH_RECOGNIZE is disabled in KQP ] EXCLUDED_TESTS = [ diff --git a/ydb/tests/tools/kqprun/configuration/app_config.conf b/ydb/tests/tools/kqprun/configuration/app_config.conf index eebc7479f333..46cbe9bc2e51 100644 --- a/ydb/tests/tools/kqprun/configuration/app_config.conf +++ b/ydb/tests/tools/kqprun/configuration/app_config.conf @@ -30,6 +30,7 @@ QueryServiceConfig { QueryArtifactsCompressionMethod: "zstd_6" ScriptResultRowsLimit: 0 ScriptResultSizeLimit: 10485760 + EnableMatchRecognize: true FileStorage { MaxFiles: 1000 diff --git a/ydb/tools/query_replay/query_compiler.cpp b/ydb/tools/query_replay/query_compiler.cpp index 
53d9f816a623..40046ca100ff 100644 --- a/ydb/tools/query_replay/query_compiler.cpp +++ b/ydb/tools/query_replay/query_compiler.cpp @@ -292,7 +292,7 @@ class TReplayCompileActor: public TActorBootstrapped { TlsActivationContext->ExecutorThread.ActorSystem, SelfId().NodeId(), counters); auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); KqpHost = CreateKqpHost(Gateway, Query->Cluster, Query->Database, Config, ModuleResolverState->ModuleResolver, - federatedQuerySetup, nullptr, GUCSettings, Nothing(), FunctionRegistry, false); + federatedQuerySetup, nullptr, GUCSettings, NKikimrConfig::TQueryServiceConfig(), Nothing(), FunctionRegistry, false); IKqpHost::TPrepareSettings prepareSettings; prepareSettings.DocumentApiRestricted = false; diff --git a/ydb/tools/query_replay_yt/query_compiler.cpp b/ydb/tools/query_replay_yt/query_compiler.cpp index d354c69bec91..63cfaab8f5c4 100644 --- a/ydb/tools/query_replay_yt/query_compiler.cpp +++ b/ydb/tools/query_replay_yt/query_compiler.cpp @@ -596,7 +596,7 @@ class TReplayCompileActor: public TActorBootstrapped { TlsActivationContext->ExecutorThread.ActorSystem, SelfId().NodeId(), counters); auto federatedQuerySetup = std::make_optional({NYql::IHTTPGateway::Make(), nullptr, nullptr, nullptr, {}, {}, {}, nullptr, nullptr, {}}); KqpHost = CreateKqpHost(Gateway, Query->Cluster, Query->Database, Config, ModuleResolverState->ModuleResolver, - federatedQuerySetup, nullptr, GUCSettings, Nothing(), FunctionRegistry, false); + federatedQuerySetup, nullptr, GUCSettings, NKikimrConfig::TQueryServiceConfig(), Nothing(), FunctionRegistry, false); StartCompilation(); Continue(); From 9f80aafd8b24761963f47d0f933094b3a9179070 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Fri, 9 Aug 2024 16:40:16 +0300 Subject: [PATCH 08/56] secrets have been fixed (#7409) (#7571) --- .../synchronization_service.cpp | 2 + .../actors/query_utils.cpp | 44 +++++++++++------ 
.../control_plane_proxy/actors/query_utils.h | 8 ++-- .../actors/ydb_schema_query_actor.cpp | 47 +++++++++++-------- 4 files changed, 65 insertions(+), 36 deletions(-) diff --git a/ydb/core/fq/libs/compute/ydb/synchronization_service/synchronization_service.cpp b/ydb/core/fq/libs/compute/ydb/synchronization_service/synchronization_service.cpp index 773dff40c888..0877d3e97f6a 100644 --- a/ydb/core/fq/libs/compute/ydb/synchronization_service/synchronization_service.cpp +++ b/ydb/core/fq/libs/compute/ydb/synchronization_service/synchronization_service.cpp @@ -396,6 +396,7 @@ class TSynchronizeScopeActor : public NActors::TActorBootstrappedGet()->YDBClient = Client; request.Get()->Get()->ComputeDatabase = ComputeDatabase; + request.Get()->Get()->Scope = Scope; Register(NFq::NPrivate::MakeCreateConnectionActor( SelfId(), @@ -425,6 +426,7 @@ class TSynchronizeScopeActor : public NActors::TActorBootstrappedGet()->YDBClient = Client; request.Get()->Get()->ComputeDatabase = ComputeDatabase; + request.Get()->Get()->Scope = Scope; auto it = Connections.find(binding.second.content().connection_id()); if (it == Connections.end()) { diff --git a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp index 1a781f374205..c69f279be3e3 100644 --- a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp +++ b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp @@ -11,6 +11,14 @@ namespace NFq { namespace NPrivate { +namespace { + +TString MakeSecretKeyName(const TString& prefix, const TString& folderId, const TString& name) { + return TStringBuilder{} << prefix << "_" << folderId << "_" << name; +} + +} + TString MakeCreateExternalDataTableQuery(const FederatedQuery::BindingContent& content, const TString& connectionName, bool replaceIfExists) { @@ -94,7 +102,8 @@ TString SignAccountId(const TString& id, const TSigner::TPtr& signer) { TMaybe CreateSecretObjectQuery(const FederatedQuery::ConnectionSetting& 
setting, const TString& name, - const TSigner::TPtr& signer) { + const TSigner::TPtr& signer, + const TString& folderId) { using namespace fmt::literals; TString secretObjects; auto serviceAccountId = ExtractServiceAccountId(setting); @@ -103,7 +112,7 @@ TMaybe CreateSecretObjectQuery(const FederatedQuery::ConnectionSetting& R"( UPSERT OBJECT {sa_secret_name} (TYPE SECRET) WITH value={signature}; )", - "sa_secret_name"_a = EncloseAndEscapeString("k1" + name, '`'), + "sa_secret_name"_a = EncloseAndEscapeString(MakeSecretKeyName("f1", folderId, name), '`'), "signature"_a = EncloseSecret(EncloseAndEscapeString(SignAccountId(serviceAccountId, signer), '"'))) : std::string{}; } @@ -113,7 +122,7 @@ TMaybe CreateSecretObjectQuery(const FederatedQuery::ConnectionSetting& R"( UPSERT OBJECT {password_secret_name} (TYPE SECRET) WITH value={password}; )", - "password_secret_name"_a = EncloseAndEscapeString("k2" + name, '`'), + "password_secret_name"_a = EncloseAndEscapeString(MakeSecretKeyName("f2", folderId, name), '`'), "password"_a = EncloseSecret(EncloseAndEscapeString(*password, '"'))); } @@ -122,7 +131,8 @@ TMaybe CreateSecretObjectQuery(const FederatedQuery::ConnectionSetting& TString CreateAuthParamsQuery(const FederatedQuery::ConnectionSetting& setting, const TString& name, - const TSigner::TPtr& signer) { + const TSigner::TPtr& signer, + const TString& folderId) { using namespace fmt::literals; auto authMethod = GetYdbComputeAuthMethod(setting); switch (authMethod) { @@ -139,7 +149,7 @@ TString CreateAuthParamsQuery(const FederatedQuery::ConnectionSetting& setting, )", "auth_method"_a = ToString(authMethod), "service_account_id"_a = EncloseAndEscapeString(ExtractServiceAccountId(setting), '"'), - "sa_secret_name"_a = EncloseAndEscapeString(signer ? "k1" + name : TString{}, '"')); + "sa_secret_name"_a = EncloseAndEscapeString(signer ? 
MakeSecretKeyName("f1", folderId, name) : TString{}, '"')); case EYdbComputeAuth::BASIC: return fmt::format( R"(, @@ -149,7 +159,7 @@ TString CreateAuthParamsQuery(const FederatedQuery::ConnectionSetting& setting, )", "auth_method"_a = ToString(authMethod), "login"_a = EncloseAndEscapeString(GetLogin(setting).GetOrElse({}), '"'), - "password_secret_name"_a = EncloseAndEscapeString("k2" + name, '"')); + "password_secret_name"_a = EncloseAndEscapeString(MakeSecretKeyName("f2", folderId, name), '"')); case EYdbComputeAuth::MDB_BASIC: return fmt::format( R"(, @@ -161,9 +171,9 @@ TString CreateAuthParamsQuery(const FederatedQuery::ConnectionSetting& setting, )", "auth_method"_a = ToString(authMethod), "service_account_id"_a = EncloseAndEscapeString(ExtractServiceAccountId(setting), '"'), - "sa_secret_name"_a = EncloseAndEscapeString(signer ? "k1" + name : TString{}, '"'), + "sa_secret_name"_a = EncloseAndEscapeString(signer ? MakeSecretKeyName("f1", folderId, name) : TString{}, '"'), "login"_a = EncloseAndEscapeString(GetLogin(setting).GetOrElse({}), '"'), - "password_secret_name"_a = EncloseAndEscapeString("k2" + name, '"')); + "password_secret_name"_a = EncloseAndEscapeString(MakeSecretKeyName("f2", folderId, name), '"')); } } @@ -171,7 +181,8 @@ TString MakeCreateExternalDataSourceQuery( const FederatedQuery::ConnectionContent& connectionContent, const TSigner::TPtr& signer, const NConfig::TCommonConfig& common, - bool replaceIfExists) { + bool replaceIfExists, + const TString& folderId) { using namespace fmt::literals; TString properties; @@ -278,20 +289,25 @@ TString MakeCreateExternalDataSourceQuery( "auth_params"_a = CreateAuthParamsQuery(connectionContent.setting(), connectionContent.name(), - signer)); + signer, + folderId)); } -TMaybe DropSecretObjectQuery(const TString& name) { +TMaybe DropSecretObjectQuery(const TString& name, const TString& folderId) { using namespace fmt::literals; return fmt::format( R"( DROP OBJECT {secret_name1} (TYPE SECRET); DROP 
OBJECT {secret_name2} (TYPE SECRET); DROP OBJECT {secret_name3} (TYPE SECRET); -- for backward compatibility + DROP OBJECT {secret_name4} (TYPE SECRET); -- for backward compatibility + DROP OBJECT {secret_name5} (TYPE SECRET); -- for backward compatibility )", - "secret_name1"_a = EncloseAndEscapeString("k1" + name, '`'), - "secret_name2"_a = EncloseAndEscapeString("k2" + name, '`'), - "secret_name3"_a = EncloseAndEscapeString(name, '`')); + "secret_name1"_a = EncloseAndEscapeString(MakeSecretKeyName("f1", folderId, name), '`'), + "secret_name2"_a = EncloseAndEscapeString(MakeSecretKeyName("f2", folderId, name), '`'), + "secret_name3"_a = EncloseAndEscapeString(TStringBuilder{} << "k1" << name, '`'), + "secret_name4"_a = EncloseAndEscapeString(TStringBuilder{} << "k2" << name, '`'), + "secret_name5"_a = EncloseAndEscapeString(name, '`')); } TString MakeDeleteExternalDataTableQuery(const TString& tableName) { diff --git a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.h b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.h index ebfe43b5e228..92ed74341c43 100644 --- a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.h +++ b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.h @@ -10,15 +10,17 @@ namespace NPrivate { TMaybe CreateSecretObjectQuery(const FederatedQuery::ConnectionSetting& setting, const TString& name, - const TSigner::TPtr& signer); + const TSigner::TPtr& signer, + const TString& folderId); -TMaybe DropSecretObjectQuery(const TString& name); +TMaybe DropSecretObjectQuery(const TString& name, const TString& folderId); TString MakeCreateExternalDataSourceQuery( const FederatedQuery::ConnectionContent& connectionContent, const TSigner::TPtr& signer, const NConfig::TCommonConfig& common, - bool replaceIfExists); + bool replaceIfExists, + const TString& folderId); TString MakeDeleteExternalDataSourceQuery(const TString& sourceName); diff --git a/ydb/core/fq/libs/control_plane_proxy/actors/ydb_schema_query_actor.cpp 
b/ydb/core/fq/libs/control_plane_proxy/actors/ydb_schema_query_actor.cpp index 2854598fabe5..26eae1ef7670 100644 --- a/ydb/core/fq/libs/control_plane_proxy/actors/ydb_schema_query_actor.cpp +++ b/ydb/core/fq/libs/control_plane_proxy/actors/ydb_schema_query_actor.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace NFq::NPrivate { @@ -418,7 +419,7 @@ class TGenerateRecoverySQLIfExternalDataSourceAlreadyExistsActor : event->IsExactNameMatch = true; - TBase::Send(NFq::ControlPlaneStorageServiceActorId(), event); + TBase::Send(::NFq::ControlPlaneStorageServiceActorId(), event); } STRICT_STFUNC(StateFunc, cFunc(NActors::TEvents::TSystem::Wakeup, TBase::HandleTimeout); @@ -493,7 +494,7 @@ class TGenerateRecoverySQLIfExternalDataTableAlreadyExistsActor : event->IsExactNameMatch = true; - TBase::Send(NFq::ControlPlaneStorageServiceActorId(), event); + TBase::Send(::NFq::ControlPlaneStorageServiceActorId(), event); } STRICT_STFUNC(StateFunc, cFunc(NActors::TEvents::TSystem::Wakeup, TBase::HandleTimeout); @@ -543,7 +544,7 @@ IActor* MakeCreateConnectionActor( TCounters& counters, TPermissions permissions, const TCommonConfig& commonConfig, - const NFq::TComputeConfig& computeConfig, + const ::NFq::TComputeConfig& computeConfig, TSigner::TPtr signer, bool withoutRollback, TMaybe connectionId) { @@ -557,10 +558,13 @@ IActor* MakeCreateConnectionActor( computeConfig](const TEvControlPlaneProxy::TEvCreateConnectionRequest::TPtr& req) -> std::vector { auto& connectionContent = req->Get()->Request.content(); + const auto& scope = req->Get()->Scope; + const TString folderId = NYdb::NFq::TScope{scope}.ParseFolder(); auto createSecretStatement = CreateSecretObjectQuery(connectionContent.setting(), connectionContent.name(), - signer); + signer, + folderId); std::vector statements; if (createSecretStatement) { @@ -603,7 +607,7 @@ IActor* MakeCreateConnectionActor( statements.push_back(TSchemaQueryTask{ .SQL = MakeCreateExternalDataSourceQuery( connectionContent, 
signer, commonConfig, - computeConfig.IsReplaceIfExistsSyntaxSupported()), + computeConfig.IsReplaceIfExistsSyntaxSupported(), folderId), .ScheduleErrorRecoverySQLGeneration = withoutRollback ? NoRecoverySQLGeneration() @@ -647,7 +651,7 @@ IActor* MakeModifyConnectionActor( TDuration requestTimeout, TCounters& counters, const TCommonConfig& commonConfig, - const NFq::TComputeConfig& computeConfig, + const ::NFq::TComputeConfig& computeConfig, TSigner::TPtr signer) { auto queryFactoryMethod = [signer = std::move(signer), @@ -659,13 +663,16 @@ IActor* MakeModifyConnectionActor( auto& oldConnectionContent = (*request->Get()->OldConnectionContent); auto& oldBindings = request->Get()->OldBindingContents; auto& newConnectionContent = request->Get()->Request.content(); + const auto& scope = request->Get()->Scope; + const TString folderId = NYdb::NFq::TScope{scope}.ParseFolder(); auto dropOldSecret = - DropSecretObjectQuery(oldConnectionContent.name()); + DropSecretObjectQuery(oldConnectionContent.name(), folderId); auto createNewSecret = CreateSecretObjectQuery(newConnectionContent.setting(), newConnectionContent.name(), - signer); + signer, + folderId); bool replaceSupported = computeConfig.IsReplaceIfExistsSyntaxSupported(); if (replaceSupported && @@ -673,7 +680,7 @@ IActor* MakeModifyConnectionActor( // CREATE OR REPLACE auto createSecretStatement = CreateSecretObjectQuery(newConnectionContent.setting(), - newConnectionContent.name(), signer); + newConnectionContent.name(), signer, folderId); std::vector statements; if (createSecretStatement) { @@ -683,7 +690,7 @@ IActor* MakeModifyConnectionActor( statements.push_back(TSchemaQueryTask{ .SQL = MakeCreateExternalDataSourceQuery( - newConnectionContent, signer, commonConfig, replaceSupported)}); + newConnectionContent, signer, commonConfig, replaceSupported, folderId)}); return statements; } @@ -712,7 +719,7 @@ IActor* MakeModifyConnectionActor( statements.push_back(TSchemaQueryTask{ .SQL = 
TString{MakeDeleteExternalDataSourceQuery(oldConnectionContent.name())}, .RollbackSQL = TString{MakeCreateExternalDataSourceQuery( - oldConnectionContent, signer, commonConfig, false)}, + oldConnectionContent, signer, commonConfig, false, folderId)}, .ShouldSkipStepOnError = IsPathDoesNotExistIssue}); if (dropOldSecret) { @@ -720,18 +727,18 @@ IActor* MakeModifyConnectionActor( .SQL = *dropOldSecret, .RollbackSQL = CreateSecretObjectQuery(oldConnectionContent.setting(), oldConnectionContent.name(), - signer), + signer, folderId), .ShouldSkipStepOnError = IsPathDoesNotExistIssue}); } if (createNewSecret) { statements.push_back(TSchemaQueryTask{.SQL = *createNewSecret, .RollbackSQL = DropSecretObjectQuery( - newConnectionContent.name())}); + newConnectionContent.name(), folderId)}); } statements.push_back( TSchemaQueryTask{.SQL = TString{MakeCreateExternalDataSourceQuery( - newConnectionContent, signer, commonConfig, false)}, + newConnectionContent, signer, commonConfig, false, folderId)}, .RollbackSQL = TString{MakeDeleteExternalDataSourceQuery( newConnectionContent.name())}}); @@ -787,15 +794,17 @@ IActor* MakeDeleteConnectionActor( const TEvControlPlaneProxy::TEvDeleteConnectionRequest::TPtr& request) -> std::vector { auto& connectionContent = *request->Get()->ConnectionContent; + const auto& scope = request->Get()->Scope; + const TString folderId = NYdb::NFq::TScope{scope}.ParseFolder(); auto dropSecret = - DropSecretObjectQuery(connectionContent.name()); + DropSecretObjectQuery(connectionContent.name(), folderId); std::vector statements = { TSchemaQueryTask{.SQL = TString{MakeDeleteExternalDataSourceQuery( connectionContent.name())}, .RollbackSQL = MakeCreateExternalDataSourceQuery( - connectionContent, signer, commonConfig, false), + connectionContent, signer, commonConfig, false, folderId), .ShouldSkipStepOnError = IsPathDoesNotExistIssue}}; if (dropSecret) { statements.push_back( @@ -803,7 +812,7 @@ IActor* MakeDeleteConnectionActor( .RollbackSQL = 
CreateSecretObjectQuery(connectionContent.setting(), connectionContent.name(), - signer), + signer, folderId), .ShouldSkipStepOnError = IsPathDoesNotExistIssue}); } return statements; @@ -832,7 +841,7 @@ IActor* MakeCreateBindingActor(const TActorId& proxyActorId, TDuration requestTimeout, TCounters& counters, TPermissions permissions, - const NFq::TComputeConfig& computeConfig,bool withoutRollback, + const ::NFq::TComputeConfig& computeConfig,bool withoutRollback, TMaybe bindingId) { auto queryFactoryMethod = [requestTimeout, &counters, permissions, withoutRollback, computeConfig]( @@ -916,7 +925,7 @@ IActor* MakeModifyBindingActor(const TActorId& proxyActorId, TEvControlPlaneProxy::TEvModifyBindingRequest::TPtr request, TDuration requestTimeout, TCounters& counters, - const NFq::TComputeConfig& computeConfig) { + const ::NFq::TComputeConfig& computeConfig) { auto queryFactoryMethod = [computeConfig](const TEvControlPlaneProxy::TEvModifyBindingRequest::TPtr& request) -> std::vector { From cb0bb8d0f0669efa7218253325132651a163067f Mon Sep 17 00:00:00 2001 From: Vitaly Isaev Date: Fri, 9 Aug 2024 18:37:48 +0300 Subject: [PATCH 09/56] Merge #7607 (#7627) --- .../fq/libs/cloud_audit/yq_cloud_audit_service.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ydb/core/fq/libs/cloud_audit/yq_cloud_audit_service.cpp b/ydb/core/fq/libs/cloud_audit/yq_cloud_audit_service.cpp index 5647add100b2..ded18a3816dc 100644 --- a/ydb/core/fq/libs/cloud_audit/yq_cloud_audit_service.cpp +++ b/ydb/core/fq/libs/cloud_audit/yq_cloud_audit_service.cpp @@ -65,7 +65,11 @@ std::string MapConnectionType(const FederatedQuery::ConnectionSetting::Connectio return "Monitoring"; case FederatedQuery::ConnectionSetting::ConnectionCase::kPostgresqlCluster: return "PostgreSQLCluster"; - default: + case FederatedQuery::ConnectionSetting::ConnectionCase::kGreenplumCluster: + return "GreenplumCluster"; + case FederatedQuery::ConnectionSetting::ConnectionCase::kMysqlCluster: + 
return "MySQLCluster"; + case FederatedQuery::ConnectionSetting::ConnectionCase::CONNECTION_NOT_SET: Y_ENSURE(false, "Invalid connection case " << i32(connectionCase)); } } @@ -76,8 +80,8 @@ std::string MapBindingType(const FederatedQuery::BindingSetting::BindingCase& bi return "YdbDataStreams"; case FederatedQuery::BindingSetting::BindingSetting::kObjectStorage: return "ObjectStorage"; - default: - Y_ENSURE(false, "Invalid connection case " << i32(bindingCase)); + case FederatedQuery::BindingSetting::BindingSetting::BINDING_NOT_SET: + Y_ENSURE(false, "Invalid binding case " << i32(bindingCase)); } } From 778dd8e1afc77c343f557f095eb5ac676b7e8594 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Wed, 14 Aug 2024 07:45:01 +0300 Subject: [PATCH 10/56] YQ revert local grpc peer value (#7739) --- ydb/core/grpc_services/local_grpc/local_grpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/grpc_services/local_grpc/local_grpc.h b/ydb/core/grpc_services/local_grpc/local_grpc.h index 9a67e1c2ed96..8c00724d81c1 100644 --- a/ydb/core/grpc_services/local_grpc/local_grpc.h +++ b/ydb/core/grpc_services/local_grpc/local_grpc.h @@ -73,7 +73,7 @@ class TContextBase : public NYdbGrpc::IRequestContextBase { } void FinishStreamingOk() override {} TAsyncFinishResult GetFinishFuture() override { return {}; } - TString GetPeer() const override { return "localhost"; } + TString GetPeer() const override { return {}; } bool SslServer() const override { return false; } bool IsClientLost() const override { return false; } bool IsStreamCall() const override { return false; } From 10b61f8804f75d216574e11e030d8482f1a125cd Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Sun, 18 Aug 2024 21:38:26 +0300 Subject: [PATCH 11/56] peer name has been fixed (#7809) (#7944) --- ydb/core/grpc_services/ydb_over_fq/fq_local_grpc_events.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/ydb/core/grpc_services/ydb_over_fq/fq_local_grpc_events.h b/ydb/core/grpc_services/ydb_over_fq/fq_local_grpc_events.h index 097a0f46dbec..3a9a69c6a3c7 100644 --- a/ydb/core/grpc_services/ydb_over_fq/fq_local_grpc_events.h +++ b/ydb/core/grpc_services/ydb_over_fq/fq_local_grpc_events.h @@ -27,6 +27,11 @@ class TLocalGrpcFqContext : public NLocalGrpc::TContext { return TBase::GetPeerMetaValues(key); } + + TString GetPeer() const override { + return TBase::GetBaseRequest().GetPeerName(); + } + private: TString Scope_; }; From b6ac00fae4f672879b554e35b2af8c95dc60ccb9 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:06:51 +0300 Subject: [PATCH 12/56] YQ-3460 fix error attempt to read after eof (#7943) --- .../s3/kqp_federated_query_ut.cpp | 41 +++++++++++++++++++ .../clickhouse/client/src/IO/ReadHelpers.cpp | 3 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp index b216d24f96f9..23a6eda086ba 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp @@ -1788,6 +1788,47 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { Y_UNIT_TEST(ExecuteScriptWithLargeFile) { ExecuteSelectQuery("test_bucket_execute_script_with_large_file", 5_MB, 500000); } + + Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) { + const TString externalDataSourceName = "/Root/external_data_source"; + const TString bucket = "test_bucket1"; + + CreateBucketWithObject(bucket, "test_object", ""); + + auto kikimr = NTestUtils::MakeKikimrRunner(); + + auto tc = kikimr->GetTableClient(); + auto session = tc.CreateSession().GetValueSync().GetSession(); + const TString query = fmt::format(R"( + CREATE EXTERNAL DATA SOURCE `{external_source}` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="{location}", + AUTH_METHOD="NONE" + 
);)", + "external_source"_a = externalDataSourceName, + "location"_a = GetBucketLocation(bucket) + ); + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString()); + + const TString sql = fmt::format(R"( + SELECT * FROM `{external_source}`.`/` + WITH ( + SCHEMA = ( + data String + ), + FORMAT = "csv_with_names" + ) + )", "external_source"_a=externalDataSourceName); + + auto db = kikimr->GetQueryClient(); + auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString()); + UNIT_ASSERT(scriptExecutionOperation.Metadata().ExecutionId); + + NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); + UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString()); + } } } // namespace NKikimr::NKqp diff --git a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp index 18966814c1ec..a99aa06ec90f 100644 --- a/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp +++ b/ydb/library/yql/udfs/common/clickhouse/client/src/IO/ReadHelpers.cpp @@ -617,8 +617,9 @@ void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings) { + /// Empty string if (buf.eof()) - throwReadAfterEOF(); + return; const char delimiter = settings.delimiter; const char maybe_quote = *buf.position(); From 7f6ae296204e65c76fc6ded9a8cadb4c7187aad9 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:07:59 +0300 Subject: [PATCH 13/56] YQ-3410 improve s3 
url escape, added '#', '?' symbols (#7923) --- .../s3/kqp_federated_query_ut.cpp | 17 +++++++-- .../s3/actors/yql_s3_raw_read_actor.cpp | 2 +- .../providers/s3/actors/yql_s3_read_actor.cpp | 2 +- .../s3/actors/yql_s3_write_actor.cpp | 2 +- .../yql/providers/s3/common/ut/ya.make | 12 +++++++ ydb/library/yql/providers/s3/common/util.cpp | 35 +++++++++++++++++++ ydb/library/yql/providers/s3/common/util.h | 5 +++ .../yql/providers/s3/common/util_ut.cpp | 33 +++++++++++++++++ ydb/library/yql/providers/s3/common/ya.make | 4 +++ .../s3/object_listers/yql_s3_list.cpp | 2 +- 10 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 ydb/library/yql/providers/s3/common/ut/ya.make create mode 100644 ydb/library/yql/providers/s3/common/util_ut.cpp diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp index 23a6eda086ba..44e75b66ae2a 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp @@ -20,11 +20,22 @@ using namespace NTestUtils; using namespace fmt::literals; Y_UNIT_TEST_SUITE(KqpFederatedQuery) { + TString GetSymbolsString(char start, char end, const TString& skip = "") { + TStringBuilder result; + for (char symbol = start; symbol <= end; ++symbol) { + if (skip.Contains(symbol)) { + continue; + } + result << symbol; + } + return result; + } + Y_UNIT_TEST(ExecuteScriptWithExternalTableResolve) { const TString externalDataSourceName = "/Root/external_data_source"; const TString externalTableName = "/Root/test_binding_resolve"; const TString bucket = "test_bucket1"; - const TString object = "test_object"; + const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object"; CreateBucketWithObject(bucket, object, TEST_CONTENT); @@ -49,7 +60,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { "external_source"_a = externalDataSourceName, "external_table"_a = 
externalTableName, "location"_a = GetBucketLocation(bucket), - "object"_a = object + "object"_a = EscapeC(object) ); auto result = session.ExecuteSchemeQuery(query).GetValueSync(); UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString()); @@ -930,7 +941,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { const TString externalDataSourceName = "/Root/external_data_source"; const TString externalTableName = "/Root/test_binding_resolve"; const TString bucket = "test_bucket1"; - const TString object = "year=1/month=2/test_object"; + const TString object = TStringBuilder() << "year=1/month=2/test_" << GetSymbolsString(' ', '~') << "_object"; const TString content = "data,year,month\ntest,1,2"; CreateBucketWithObject(bucket, object, content); diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp index d5bfdd479e2f..29381cdad996 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp @@ -167,7 +167,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID const auto& authInfo = Credentials.GetAuthInfo(); LOG_D("TS3ReadActor", "Download: " << url << ", ID: " << id << ", request id: [" << requestId << "]"); Gateway->Download( - UrlEscapeRet(url, true), + NS3Util::UrlEscapeRet(url), IHTTPGateway::MakeYcHeaders(requestId, authInfo.GetToken(), {}, authInfo.GetAwsUserPwd(), authInfo.GetAwsSigV4()), 0U, std::min(object.GetSize(), SizeLimit), diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index de1fcc7a1b4d..c8dc7c4cb7ea 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -186,7 +186,7 @@ struct TRetryStuff { const TString& requestId, const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy ) : 
Gateway(std::move(gateway)) - , Url(UrlEscapeRet(url, true)) + , Url(NS3Util::UrlEscapeRet(url)) , Headers(headers) , Offset(0U) , SizeLimit(sizeLimit) diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_write_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_write_actor.cpp index 76b65478da26..6c26518dfe90 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_write_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_write_actor.cpp @@ -578,7 +578,7 @@ class TS3WriteActor : public TActorBootstrapped, public IDqComput Gateway, Credentials, key, - UrlEscapeRet(Url + Path + key + MakeOutputName() + Extension, true), + NS3Util::UrlEscapeRet(Url + Path + key + MakeOutputName() + Extension), Compression, RetryPolicy, DirtyWrite, Token); keyIt->second.emplace_back(fileWrite.get()); diff --git a/ydb/library/yql/providers/s3/common/ut/ya.make b/ydb/library/yql/providers/s3/common/ut/ya.make new file mode 100644 index 000000000000..61c94727884b --- /dev/null +++ b/ydb/library/yql/providers/s3/common/ut/ya.make @@ -0,0 +1,12 @@ +UNITTEST_FOR(ydb/library/yql/providers/s3/common) + +SRCS( + util_ut.cpp +) + +PEERDIR( + ydb/library/yql/public/udf/service/stub + ydb/library/yql/sql/pg_dummy +) + +END() diff --git a/ydb/library/yql/providers/s3/common/util.cpp b/ydb/library/yql/providers/s3/common/util.cpp index b1947f1eb5ff..594040188745 100644 --- a/ydb/library/yql/providers/s3/common/util.cpp +++ b/ydb/library/yql/providers/s3/common/util.cpp @@ -1,7 +1,35 @@ #include "util.h" +#include + + namespace NYql::NS3Util { +namespace { + +inline char d2x(unsigned x) { + return (char)((x < 10) ? ('0' + x) : ('A' + x - 10)); +} + +char* UrlEscape(char* to, const char* from) { + while (*from) { + if (*from == '%' || *from == '#' || *from == '?' 
|| (unsigned char)*from <= ' ' || (unsigned char)*from > '~') { + *to++ = '%'; + *to++ = d2x((unsigned char)*from >> 4); + *to++ = d2x((unsigned char)*from & 0xF); + } else { + *to++ = *from; + } + ++from; + } + + *to = 0; + + return to; +} + +} + TIssues AddParentIssue(const TStringBuilder& prefix, TIssues&& issues) { if (!issues) { return TIssues{}; @@ -13,4 +41,11 @@ TIssues AddParentIssue(const TStringBuilder& prefix, TIssues&& issues) { return TIssues{result}; } +TString UrlEscapeRet(const TStringBuf from) { + TString to; + to.ReserveAndResize(CgiEscapeBufLen(from.size())); + to.resize(UrlEscape(to.begin(), from.begin()) - to.data()); + return to; +} + } diff --git a/ydb/library/yql/providers/s3/common/util.h b/ydb/library/yql/providers/s3/common/util.h index a8086b5558ed..f49ab1096bb1 100644 --- a/ydb/library/yql/providers/s3/common/util.h +++ b/ydb/library/yql/providers/s3/common/util.h @@ -7,4 +7,9 @@ namespace NYql::NS3Util { TIssues AddParentIssue(const TStringBuilder& prefix, TIssues&& issues); +// Like UrlEscape with forceEscape = true +// from ydb/library/cpp/string_utils/quote/quote.h, but also escapes: +// '#', '?' 
+TString UrlEscapeRet(const TStringBuf from); + } diff --git a/ydb/library/yql/providers/s3/common/util_ut.cpp b/ydb/library/yql/providers/s3/common/util_ut.cpp new file mode 100644 index 000000000000..2dcbf47ceef3 --- /dev/null +++ b/ydb/library/yql/providers/s3/common/util_ut.cpp @@ -0,0 +1,33 @@ +#include "util.h" + +#include +#include + + +namespace NYql::NS3Util { + +Y_UNIT_TEST_SUITE(TestS3UrlEscape) { + // Tests on force UrlEscape copied from library/cpp/string_utils/quote/quote_ut.cpp + Y_UNIT_TEST(EscapeEscapedForce) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(NS3Util::UrlEscapeRet(s), "hello%253dworld"); + } + + Y_UNIT_TEST(EscapeUnescapeForceRet) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(UrlUnescapeRet(NS3Util::UrlEscapeRet(s)), "hello%3dworld"); + } + + // Test additional symbols escape + Y_UNIT_TEST(EscapeAdditionalSymbols) { + TString s = "hello#?world"; + + UNIT_ASSERT_VALUES_EQUAL(NS3Util::UrlEscapeRet(s), "hello%23%3Fworld"); + } +} + +} // namespace NYql::NS3Util diff --git a/ydb/library/yql/providers/s3/common/ya.make b/ydb/library/yql/providers/s3/common/ya.make index 426cb2339bbb..6aa2ee7748a1 100644 --- a/ydb/library/yql/providers/s3/common/ya.make +++ b/ydb/library/yql/providers/s3/common/ya.make @@ -34,3 +34,7 @@ IF (CLANG AND NOT WITH_VALGRIND) ENDIF() END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index 32f2df4629b0..be5fd6134c2e 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -255,7 +255,7 @@ class TS3Lister : public IS3Lister { MakeFilter(listingRequest.Pattern, listingRequest.PatternType, sharedCtx); auto request = listingRequest; - request.Url = UrlEscapeRet(request.Url, true); + request.Url = NS3Util::UrlEscapeRet(request.Url); auto ctx = TListingContext{ 
std::move(sharedCtx), std::move(filter), From 5de15590e39e98e8b7c084ec0fc28c91a9320033 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:34:05 +0300 Subject: [PATCH 14/56] YQ-3154 improve error in s3 applicator actor (#8010) --- .../kqp_finalize_script_actor.cpp | 6 +- .../s3/actors/yql_s3_applicator_actor.cpp | 60 +++++++++++++++---- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/ydb/core/kqp/finalize_script_service/kqp_finalize_script_actor.cpp b/ydb/core/kqp/finalize_script_service/kqp_finalize_script_actor.cpp index 6927cf0549d6..a32c479ba79b 100644 --- a/ydb/core/kqp/finalize_script_service/kqp_finalize_script_actor.cpp +++ b/ydb/core/kqp/finalize_script_service/kqp_finalize_script_actor.cpp @@ -181,7 +181,11 @@ class TScriptFinalizerActor : public TActorBootstrapped { void Handle(NFq::TEvents::TEvEffectApplicationResult::TPtr& ev) { if (ev->Get()->FatalError) { - FinishScriptFinalization(Ydb::StatusIds::BAD_REQUEST, std::move(ev->Get()->Issues)); + NYql::TIssue rootIssue("Failed to commit/abort s3 multipart uploads"); + for (const NYql::TIssue& issue : ev->Get()->Issues) { + rootIssue.AddSubIssue(MakeIntrusive(issue)); + } + FinishScriptFinalization(Ydb::StatusIds::BAD_REQUEST, {rootIssue}); } else { FinishScriptFinalization(); } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_applicator_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_applicator_actor.cpp index e8a84a6e6916..fbd1a4c296b0 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_applicator_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_applicator_actor.cpp @@ -273,16 +273,43 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedCreateRetryState()->GetNextRetryDelay(curlResponseCode, httpResponseCode); - Issues.AddIssue(TStringBuilder() << "Retry operation " << operationName << ", curl error: " << curl_easy_strerror(curlResponseCode) << ", http code: " << httpResponseCode 
<< ", url: " << url); + bool RetryOperation(IHTTPGateway::TResult&& operationResult, const TString& url, const TString& operationName) { + const auto curlResponseCode = operationResult.CurlResponseCode; + const auto httpResponseCode = operationResult.Content.HttpResponseCode; + const auto result = RetryCount && GetRetryState(operationName)->GetNextRetryDelay(curlResponseCode, httpResponseCode); + + NYql::TIssues issues = std::move(operationResult.Issues); + TStringBuilder errorMessage = TStringBuilder() << "Retry operation " << operationName << ", curl error: " << curl_easy_strerror(curlResponseCode) << ", url: " << url; + if (const TString errorText = operationResult.Content.Extract()) { + TString errorCode; + TString message; + if (!ParseS3ErrorResponse(errorText, errorCode, message)) { + message = errorText; + } + issues.AddIssues(BuildIssues(httpResponseCode, errorCode, message)); + } else { + errorMessage << ", HTTP code: " << httpResponseCode; + } + + if (issues) { + RetryIssues.AddIssues(NS3Util::AddParentIssue(errorMessage, std::move(issues))); + } else { + RetryIssues.AddIssue(errorMessage); + } + if (result) { RetryCount--; } else { - Finish(true, RetryCount - ? TString("Number of retries exceeded limit per operation") - : TStringBuilder() << "Number of retries exceeded global limit in " << GLOBAL_RETRY_LIMIT << " retries"); + Issues.AddIssues(NS3Util::AddParentIssue( + RetryCount + ? 
TStringBuilder() << "Number of retries exceeded limit for operation " << operationName + : TStringBuilder() << "Number of retries exceeded global limit in " << GLOBAL_RETRY_LIMIT << " retries", + NYql::TIssues(RetryIssues) + )); + RetryIssues.Clear(); + Finish(true); } + return result; } @@ -377,7 +404,7 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedGet()->State->BuildUrl(); LOG_D("CommitMultipartUpload ERROR " << url); - if (RetryOperation(result.CurlResponseCode, result.Content.HttpResponseCode, url, "CommitMultipartUpload")) { + if (RetryOperation(std::move(result), url, "CommitMultipartUpload")) { PushCommitMultipartUpload(ev->Get()->State); } } @@ -422,7 +449,9 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedGet()->State->Url + ev->Get()->State->Prefix; if (!UnknownPrefixes.contains(prefix)) { UnknownPrefixes.insert(prefix); - Issues.AddIssue(TIssue("Unknown uncommitted upload with prefix: " + prefix)); + TIssue issue(TStringBuilder() << "Unknown uncommitted upload with prefix: " << prefix); + issue.SetCode(NYql::DEFAULT_ERROR, NYql::TSeverityIds::S_INFO); + Issues.AddIssue(std::move(issue)); } } else { pos += KeyPrefix.size(); @@ -452,7 +481,7 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedGet()->State->BuildUrl(); LOG_D("ListMultipartUploads ERROR " << url); - if (RetryOperation(result.CurlResponseCode, result.Content.HttpResponseCode, url, "ListMultipartUploads")) { + if (RetryOperation(std::move(result), url, "ListMultipartUploads")) { PushListMultipartUploads(ev->Get()->State); } } @@ -476,7 +505,7 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedGet()->State->BuildUrl(); LOG_D("AbortMultipartUpload ERROR " << url); - if (RetryOperation(result.CurlResponseCode, result.Content.HttpResponseCode, url, "AbortMultipartUpload")) { + if (RetryOperation(std::move(result), url, "AbortMultipartUpload")) { PushAbortMultipartUpload(ev->Get()->State); } } @@ -527,7 +556,7 @@ class TS3ApplicatorActor : 
public NActors::TActorBootstrappedGet()->State->BuildUrl(); LOG_D("ListParts ERROR " << url); - if (RetryOperation(result.CurlResponseCode, result.Content.HttpResponseCode, url, "ListParts")) { + if (RetryOperation(std::move(result), url, "ListParts")) { PushListParts(ev->Get()->State); } } @@ -597,6 +626,13 @@ class TS3ApplicatorActor : public NActors::TActorBootstrappedSend(new NActors::IEventHandle(selfId, {}, new TEvPrivate::TEvListParts(state, std::move(result)))); } + IHTTPGateway::TRetryPolicy::IRetryState::TPtr& GetRetryState(const TString& operationName) { + if (const auto it = RetryStates.find(operationName); it != RetryStates.end()) { + return it->second; + } + return RetryStates.insert({operationName, RetryPolicy->CreateRetryState()}).first->second; + } + private: NActors::TActorId ParentId; IHTTPGateway::TPtr Gateway; @@ -609,11 +645,13 @@ class TS3ApplicatorActor : public NActors::TActorBootstrapped RetryStates; ui64 HttpRequestInflight = 0; ui64 RetryCount; THashSet UnknownPrefixes; THashSet CommitUploads; NYql::TIssues Issues; + NYql::TIssues RetryIssues; std::queue RequestQueue; bool ApplicationFinished = false; }; From 434b4dd9203612deab2cdb68ca573cbccf9f0adb Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Fri, 23 Aug 2024 09:17:05 +0300 Subject: [PATCH 15/56] YQ-3363 fix internal error for insert without params (#8121) --- .../providers/s3/provider/yql_s3_datasink.cpp | 9 ++++++- ydb/tests/fq/s3/test_insert.py | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_datasink.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_datasink.cpp index 4e83e373ce01..c631f86a3c23 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_datasink.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_datasink.cpp @@ -90,12 +90,19 @@ class TS3DataSinkProvider : public TDataProviderBase { TExprNode::TPtr RewriteIO(const 
TExprNode::TPtr& write, TExprContext& ctx) override { const TS3Write w(write); auto settings = write->Tail().ChildrenList(); + + TExprNode::TPtr format = ExtractFormat(settings); + if (!format) { + ctx.AddError(TIssue(ctx.GetPosition(write->Pos()), "Missing format - please use WITH FORMAT when writing into S3")); + return nullptr; + } + return Build(ctx, w.Pos()) .World(w.World()) .DataSink(w.DataSink()) .Target() .Path(write->Child(2U)->Head().Tail().HeadPtr()) - .Format(ExtractFormat(settings)) + .Format(std::move(format)) .Settings(ctx.NewList(w.Pos(), std::move(settings))) .Build() .Input(write->ChildPtr(3)) diff --git a/ydb/tests/fq/s3/test_insert.py b/ydb/tests/fq/s3/test_insert.py index 9a9946e224d9..45fea6d79092 100644 --- a/ydb/tests/fq/s3/test_insert.py +++ b/ydb/tests/fq/s3/test_insert.py @@ -510,3 +510,28 @@ def test_insert_empty_object(self, kikimr, s3, client, unique_prefix): assert result_set.columns[0].type.type_id == ydb.Type.STRING assert len(result_set.rows) == 1 assert result_set.rows[0].items[0].text_value == "" + + @yq_all + @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True) + def test_insert_without_format_error(self, kikimr, s3, client, unique_prefix): + resource = boto3.resource( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + bucket = resource.Bucket("insert_bucket") + bucket.create(ACL='public-read-write') + bucket.objects.all().delete() + + storage_connection_name = unique_prefix + "ibucket" + client.create_storage_connection(storage_connection_name, "insert_bucket") + + sql = f''' + insert into `{storage_connection_name}`.`/test/` + select * from AS_TABLE([<|foo:123, bar:"xxx"u|>,<|foo:456, bar:"yyy"u|>]); + ''' + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.FAILED) + issues = str(client.describe_query(query_id).result.query.issue) + + assert 
"Missing format - please use WITH FORMAT when writing into S3" in issues, "Incorrect Issues: " + issues From db03225c3304878a7579f4fb6d7b56e550b91936 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:08:48 +0300 Subject: [PATCH 16/56] YQ-3570 added s3 wildcards validations (#8245) --- ydb/core/external_sources/object_storage.cpp | 28 ++++++--- ydb/core/external_sources/object_storage.h | 2 +- .../external_sources/object_storage_ut.cpp | 26 +++++++++ .../request_validators.h | 2 +- ydb/core/kqp/gateway/utils/scheme_helpers.cpp | 1 + .../s3/kqp_federated_query_ut.cpp | 58 ++++++++++++++++++- .../s3/kqp_federated_scheme_ut.cpp | 22 +++++++ ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp | 8 +-- ydb/core/protos/external_sources.proto | 1 + .../s3/object_listers/yql_s3_list.cpp | 1 + .../s3/object_listers/yql_s3_path.cpp | 33 +++++++++-- .../providers/s3/object_listers/yql_s3_path.h | 1 + .../s3/object_listers/yql_s3_path_ut.cpp | 21 +++++++ .../s3/provider/yql_s3_io_discovery.cpp | 9 +++ 14 files changed, 194 insertions(+), 19 deletions(-) diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index 37e5eaa0fa1b..32f90bc976f3 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -64,7 +65,7 @@ struct TObjectStorageExternalSource : public IExternalSource { } } - if (auto issues = Validate(schema, objectStorage, PathsLimit)) { + if (auto issues = Validate(schema, objectStorage, PathsLimit, general.location())) { ythrow TExternalSourceException() << issues.ToString(); } @@ -133,11 +134,18 @@ struct TObjectStorageExternalSource : public IExternalSource { } template - static NYql::TIssues Validate(const TScheme& schema, const TObjectStorage& objectStorage, size_t pathsLimit) { + static NYql::TIssues Validate(const TScheme& 
schema, const TObjectStorage& objectStorage, size_t pathsLimit, const TString& location) { NYql::TIssues issues; - issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting())); + if (TString errorString = NYql::NS3::ValidateWildcards(location)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains invalid wildcard: " << errorString)); + } + const bool hasPartitioning = objectStorage.projection_size() || objectStorage.partitioned_by_size(); + issues.AddIssues(ValidateFormatSetting(objectStorage.format(), objectStorage.format_setting(), location, hasPartitioning)); issues.AddIssues(ValidateRawFormat(objectStorage.format(), schema, objectStorage.partitioned_by())); - if (objectStorage.projection_size() || objectStorage.partitioned_by_size()) { + if (hasPartitioning) { + if (NYql::NS3::HasWildcards(location)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Location '" << location << "' contains wildcards")); + } try { TVector partitionedBy{objectStorage.partitioned_by().begin(), objectStorage.partitioned_by().end()}; issues.AddIssues(ValidateProjectionColumns(schema, partitionedBy)); @@ -157,11 +165,17 @@ struct TObjectStorageExternalSource : public IExternalSource { return issues; } - static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting) { + static NYql::TIssues ValidateFormatSetting(const TString& format, const google::protobuf::Map& formatSetting, const TString& location, bool hasPartitioning) { NYql::TIssues issues; issues.AddIssues(ValidateDateFormatSetting(formatSetting)); for (const auto& [key, value]: formatSetting) { if (key == "file_pattern"sv) { + if (TString errorString = NYql::NS3::ValidateWildcards(value)) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "File pattern '" << value << "' contains invalid wildcard: " << 
errorString)); + } + if (value && !hasPartitioning && !location.EndsWith("/")) { + issues.AddIssue(MakeErrorIssue(Ydb::StatusIds::BAD_REQUEST, "Path pattern cannot be used with file_pattern")); + } continue; } @@ -616,8 +630,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer); } -NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit) { - return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit); +NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) { + return TObjectStorageExternalSource::Validate(schema, objectStorage, pathsLimit, location); } NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map& formatSetting, bool matchAllSettings) { diff --git a/ydb/core/external_sources/object_storage.h b/ydb/core/external_sources/object_storage.h index 6e08d8e65181..ae99009e5cde 100644 --- a/ydb/core/external_sources/object_storage.h +++ b/ydb/core/external_sources/object_storage.h @@ -15,7 +15,7 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector credentialsFactory, bool enableInfer); -NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit); +NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location); NYql::TIssues ValidateDateFormatSetting(const google::protobuf::Map& formatSetting, bool matchAllSettings = false); diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index 37e67cf63497..887d1c1d4bbf 100644 --- 
a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -29,6 +29,32 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { general.mutable_attributes()->insert({"projection.h", "b"}); UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Partition by must always be specified"); } + + Y_UNIT_TEST(WildcardsValidation) { + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false); + NKikimrExternalSources::TSchema schema; + + { // location + NKikimrExternalSources::TGeneral general; + general.set_location("{"); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '{' contains invalid wildcard:"); + } + + { // file pattern + NKikimrExternalSources::TGeneral general; + general.mutable_attributes()->insert({"file_pattern", "{"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "File pattern '{' contains invalid wildcard:"); + general.set_location("/test_file"); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Path pattern cannot be used with file_pattern"); + } + + { // partitioned by + NKikimrExternalSources::TGeneral general; + general.set_location("*"); + general.mutable_attributes()->insert({"partitioned_by", "[year]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(source->Pack(schema, general), NExternalSource::TExternalSourceException, "Location '*' contains wildcards"); + } + } } } // NKikimr diff --git a/ydb/core/fq/libs/control_plane_storage/request_validators.h b/ydb/core/fq/libs/control_plane_storage/request_validators.h index c17ac4d41a16..557a2c81a400 100644 --- a/ydb/core/fq/libs/control_plane_storage/request_validators.h +++ b/ydb/core/fq/libs/control_plane_storage/request_validators.h @@ -121,7 +121,7 @@ NYql::TIssues ValidateBinding(const T& ev, size_t 
maxSize, const TSetsecond.NotNull); } NKikimrExternalSources::TGeneral general; + general.set_location(settings.Location); auto& attributes = *general.mutable_attributes(); for (const auto& [key, value]: settings.SourceTypeParameters) { attributes.insert({key, value}); diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp index 44e75b66ae2a..736fe7ea81c5 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_query_ut.cpp @@ -35,7 +35,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { const TString externalDataSourceName = "/Root/external_data_source"; const TString externalTableName = "/Root/test_binding_resolve"; const TString bucket = "test_bucket1"; - const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "{}") << "_object"; + const TString object = TStringBuilder() << "test_" << GetSymbolsString(' ', '~', "*?{}") << "_object"; CreateBucketWithObject(bucket, object, TEST_CONTENT); @@ -1802,7 +1802,7 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { Y_UNIT_TEST(TestReadEmptyFileWithCsvFormat) { const TString externalDataSourceName = "/Root/external_data_source"; - const TString bucket = "test_bucket1"; + const TString bucket = "test_bucket12"; CreateBucketWithObject(bucket, "test_object", ""); @@ -1840,6 +1840,60 @@ Y_UNIT_TEST_SUITE(KqpFederatedQuery) { NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Completed, readyOp.Status().GetIssues().ToString()); } + + Y_UNIT_TEST(TestWildcardValidation) { + const TString bucket = "test_bucket13"; + + CreateBucket(bucket); + + auto kikimr = NTestUtils::MakeKikimrRunner(); + + auto tc = kikimr->GetTableClient(); + auto session = tc.CreateSession().GetValueSync().GetSession(); + const TString query = 
fmt::format(R"( + CREATE EXTERNAL DATA SOURCE `/Root/external_data_source` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="{location}", + AUTH_METHOD="NONE" + );)", + "location"_a = GetBucketLocation(bucket) + ); + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_C(result.GetStatus() == NYdb::EStatus::SUCCESS, result.GetIssues().ToString()); + + auto db = kikimr->GetQueryClient(); + + { // path validation + const TString sql = R"( + SELECT * FROM `/Root/external_data_source`.`/{` WITH ( + SCHEMA = (data String), + FORMAT = "csv_with_names" + ))"; + + auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString()); + + NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); + UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "Path '/{' contains invalid wildcard:"); + } + + { // file pattern validation + const TString sql = R"( + SELECT * FROM `/Root/external_data_source`.`/` WITH ( + SCHEMA = (data String), + FORMAT = "csv_with_names", + FILE_PATTERN = "{" + ))"; + + auto scriptExecutionOperation = db.ExecuteScript(sql).ExtractValueSync(); + UNIT_ASSERT_VALUES_EQUAL_C(scriptExecutionOperation.Status().GetStatus(), EStatus::SUCCESS, scriptExecutionOperation.Status().GetIssues().ToString()); + + NYdb::NQuery::TScriptExecutionOperation readyOp = WaitScriptExecutionOperation(scriptExecutionOperation.Id(), kikimr->GetDriver()); + UNIT_ASSERT_EQUAL_C(readyOp.Metadata().ExecStatus, EExecStatus::Failed, readyOp.Status().GetIssues().ToString()); + UNIT_ASSERT_STRING_CONTAINS(readyOp.Status().GetIssues().ToString(), "File pattern '{' contains invalid wildcard:"); + } + } } } // 
namespace NKikimr::NKqp diff --git a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp index 39e53fd71eef..57f1694e1ea4 100644 --- a/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/s3/kqp_federated_scheme_ut.cpp @@ -215,6 +215,28 @@ Y_UNIT_TEST_SUITE(KqpFederatedSchemeTest) { }; TestInvalidDropForExternalTableWithAuth(queryClientExecutor, "generic_query"); } + + Y_UNIT_TEST(ExternalTableDdlLocationValidation) { + auto kikimr = NTestUtils::MakeKikimrRunner(); + auto db = kikimr->GetTableClient(); + auto session = db.CreateSession().GetValueSync().GetSession(); + auto query = TStringBuilder() << R"( + CREATE EXTERNAL DATA SOURCE `/Root/ExternalDataSource` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="my-bucket", + AUTH_METHOD="NONE" + ); + CREATE EXTERNAL TABLE `/Root/ExternalTable` ( + Key Uint64, + Value String + ) WITH ( + DATA_SOURCE="/Root/ExternalDataSource", + LOCATION="{" + );)"; + auto result = session.ExecuteSchemeQuery(query).GetValueSync(); + UNIT_ASSERT_VALUES_EQUAL(result.GetStatus(), EStatus::SCHEME_ERROR); + UNIT_ASSERT_STRING_CONTAINS(result.GetIssues().ToString(), "Location '{' contains invalid wildcard:"); + } } } // namespace NKikimr::NKqp diff --git a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp index e329feaf56f7..c85a6a00dddf 100644 --- a/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp +++ b/ydb/core/kqp/ut/scheme/kqp_scheme_ut.cpp @@ -5146,7 +5146,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { month Int64 NOT NULL ) WITH ( DATA_SOURCE=")" << externalDataSourceName << R"(", - LOCATION="/folder1/*", + LOCATION="/folder1/", FORMAT="json_as_string", `projection.enabled`="true", `projection.year.type`="integer", @@ -5171,7 +5171,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { UNIT_ASSERT(externalTable.ExternalTableInfo); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 
4); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName); - UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*"); + UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/"); } Y_UNIT_TEST(CreateExternalTableWithUpperCaseSettings) { @@ -5194,7 +5194,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { Month Int64 NOT NULL ) WITH ( DATA_SOURCE=")" << externalDataSourceName << R"(", - LOCATION="/folder1/*", + LOCATION="/folder1/", FORMAT="json_as_string", `projection.enabled`="true", `projection.Year.type`="integer", @@ -5219,7 +5219,7 @@ Y_UNIT_TEST_SUITE(KqpScheme) { UNIT_ASSERT(externalTable.ExternalTableInfo); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.ColumnsSize(), 4); UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetDataSourcePath(), externalDataSourceName); - UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/*"); + UNIT_ASSERT_VALUES_EQUAL(externalTable.ExternalTableInfo->Description.GetLocation(), "/folder1/"); } Y_UNIT_TEST(DoubleCreateExternalTable) { diff --git a/ydb/core/protos/external_sources.proto b/ydb/core/protos/external_sources.proto index 2115da12de95..9f01d56e7120 100644 --- a/ydb/core/protos/external_sources.proto +++ b/ydb/core/protos/external_sources.proto @@ -11,6 +11,7 @@ message TSchema { message TGeneral { map attributes = 1 [(Ydb.size).le = 100]; + optional string location = 2; } message TObjectStorage { diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index be5fd6134c2e..f62c12ea1d4c 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -47,6 +47,7 @@ std::pair MakeFilterRegexp(const TString& regex, } else { re = 
std::make_shared(re2::StringPiece(regex), RE2::Options()); } + Y_ENSURE(re->ok()); const size_t numGroups = re->NumberOfCapturingGroups(); YQL_CLOG(DEBUG, ProviderS3) diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp index e746aeeddec3..e472caef152f 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.cpp @@ -52,13 +52,21 @@ TString RegexFromWildcards(const std::string_view& pattern) { for (const char& c : escaped) { switch (c) { case '{': - result << "(?:"; - group = true; + if (group) { + result << "\\{"; + } else { + result << "(?:"; + group = true; + } slash = false; break; case '}': - result << ')'; - group = false; + if (group) { + result << ')'; + group = false; + } else { + result << "\\}"; + } slash = false; break; case ',': @@ -89,7 +97,24 @@ TString RegexFromWildcards(const std::string_view& pattern) { break; } } + Y_ENSURE(!group, "Found unterminated group"); + Y_ENSURE(!slash, "Expected symbol after slash"); return result; } +TString ValidateWildcards(const std::string_view& pattern) { + std::optional groupStart; + for (size_t i = 0; i < pattern.size(); ++i) { + if (pattern[i] == '{' && !groupStart) { + groupStart = i; + } else if (pattern[i] == '}') { + groupStart = std::nullopt; + } + } + if (groupStart) { + return TStringBuilder() << "found unterminated group start at position " << *groupStart; + } + return {}; +} + } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h index c39f476f8893..b5266607558f 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path.h @@ -19,5 +19,6 @@ TString EscapeRegex(const TString& str); TString EscapeRegex(const std::string_view& str); TString RegexFromWildcards(const std::string_view& pattern); 
+TString ValidateWildcards(const std::string_view& pattern); } diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp index 1b452a268364..d4aaa47800f5 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_path_ut.cpp @@ -28,6 +28,27 @@ Y_UNIT_TEST_SUITE(TPathTests) { UNIT_ASSERT_VALUES_EQUAL(NormalizePath("/a/b/c/"), "a/b/c/"); UNIT_ASSERT_VALUES_EQUAL(NormalizePath("///a/b/c///"), "a/b/c/"); } + + void TestRegexFromWildcardsSuccess(const TString& wildcards, const TString& expectedRegex) { + TString errorString = ValidateWildcards(wildcards); + UNIT_ASSERT_C(errorString.empty(), errorString); + UNIT_ASSERT_VALUES_EQUAL(RegexFromWildcards(wildcards), expectedRegex); + } + + void TestRegexFromWildcardsFail(const TString& wildcards, const TString& expectedException, const TString& expectedError) { + UNIT_ASSERT_STRING_CONTAINS(ValidateWildcards(wildcards), expectedError); + UNIT_ASSERT_EXCEPTION_CONTAINS(RegexFromWildcards(wildcards), yexception, expectedException); + } + + Y_UNIT_TEST(TestRegexFromWildcards) { + TestRegexFromWildcardsSuccess("first,test\\_{alt1,alt2}_text", "first\\,test\\\\_(?:alt1|alt2)_text"); + TestRegexFromWildcardsSuccess("hello.*world?str", "hello\\..*world.str"); + TestRegexFromWildcardsSuccess("many_{},{alt1,al?t2,al*t3},{alt4}_alts", "many_(?:)\\,(?:alt1|al.t2|al.*t3)\\,(?:alt4)_alts"); + TestRegexFromWildcardsSuccess("hello}{}}world", "hello\\}(?:)\\}world"); + TestRegexFromWildcardsSuccess("hello{{{}world", "hello(?:\\{\\{)world"); + + TestRegexFromWildcardsFail("hello{}}{world", "Found unterminated group", "found unterminated group start at position 8"); + } } } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index 2c1ee3313622..436a116ecdf1 100644 --- 
a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -732,6 +732,10 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { if (!FindFilePattern(settings, ctx, filePattern)) { return false; } + if (TString errorString = NS3::ValidateWildcards(filePattern)) { + ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "File pattern '" << filePattern << "' contains invalid wildcard: " << errorString)); + return false; + } const TString effectiveFilePattern = filePattern ? filePattern : "*"; TVector paths; @@ -763,6 +767,11 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { } for (const auto& path : paths) { + if (TString errorString = NS3::ValidateWildcards(path)) { + ctx.AddError(TIssue(ctx.GetPosition(read.Pos()), TStringBuilder() << "Path '" << path << "' contains invalid wildcard: " << errorString)); + return false; + } + // each path in CONCAT() can generate multiple list requests for explicit partitioning TVector reqs; From a8059642fc8c664eed58329cfb68d660150db926 Mon Sep 17 00:00:00 2001 From: yumkam Date: Mon, 26 Aug 2024 15:41:26 +0300 Subject: [PATCH 17/56] streamlookup fixes (#8258) --- .../dq_input_transform_lookup.cpp | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 38576688ca16..646f49130507 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -168,9 +168,8 @@ class TInputTransformStreamLookupBase const auto maxKeysInRequest = LookupSource.first->GetMaxSupportedKeysInRequest(); IDqAsyncLookupSource::TUnboxedValueMap keysForLookup{maxKeysInRequest, KeyTypeHelper->GetValueHash(), KeyTypeHelper->GetValueEqual()}; while ( - 
((InputFlowFetchStatus = FetchWideInputValue(inputRowItems)) == NUdf::EFetchStatus::Ok) && - (keysForLookup.size() < maxKeysInRequest) - ) { + (keysForLookup.size() < maxKeysInRequest) && + ((InputFlowFetchStatus = FetchWideInputValue(inputRowItems)) == NUdf::EFetchStatus::Ok)) { NUdf::TUnboxedValue* keyItems; NUdf::TUnboxedValue key = HolderFactory.CreateDirectArrayHolder(InputJoinColumns.size(), keyItems); for (size_t i = 0; i != InputJoinColumns.size(); ++i) { @@ -384,6 +383,25 @@ THashMap GetNameToIndex(const ::google::protobuf::RepeatedPt return result; } +THashMap GetNameToIndex(const NMiniKQL::TStructType* type) { + THashMap result; + for (ui32 i = 0; i != type->GetMembersCount(); ++i) { + result[type->GetMemberName(i)] = i; + } + return result; +} + +TVector GetJoinColumnIndexes(const ::google::protobuf::RepeatedPtrField& names, const THashMap& joinColumns) { + TVector result; + result.reserve(joinColumns.size()); + for (int i = 0; i != names.size(); ++i) { + if (auto p = joinColumns.FindPtr(names[i])) { + result.push_back(*p); + } + } + return result; +} + TVector GetJoinColumnIndexes(const NMiniKQL::TStructType* type, const THashMap& joinColumns) { TVector result; result.reserve(joinColumns.size()); @@ -411,14 +429,15 @@ std::pair CreateInputTransformStre const auto rightRowType = DeserializeStructType(settings.GetRightSource().GetSerializedRowType(), args.TypeEnv); - auto leftJoinColumns = GetNameToIndex(settings.GetLeftJoinKeyNames()); + auto inputColumns = GetNameToIndex(narrowInputRowType); auto rightJoinColumns = GetNameToIndex(settings.GetRightJoinKeyNames()); - Y_ABORT_UNLESS(leftJoinColumns.size() == rightJoinColumns.size()); - auto leftJoinColumnIndexes = GetJoinColumnIndexes(narrowInputRowType, leftJoinColumns); - Y_ABORT_UNLESS(leftJoinColumnIndexes.size() == leftJoinColumns.size()); + auto leftJoinColumnIndexes = GetJoinColumnIndexes( + settings.GetLeftJoinKeyNames(), + inputColumns); auto rightJoinColumnIndexes = 
GetJoinColumnIndexes(rightRowType, rightJoinColumns); Y_ABORT_UNLESS(rightJoinColumnIndexes.size() == rightJoinColumns.size()); + Y_ABORT_UNLESS(leftJoinColumnIndexes.size() == rightJoinColumnIndexes.size()); const auto& [lookupKeyType, lookupPayloadType] = SplitLookupTableColumns(rightRowType, rightJoinColumns, args.TypeEnv); const auto& outputColumnsOrder = CategorizeOutputRowItems( From 8105899b4cd772bc84bf4a23441d809de7161df6 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:06:07 +0300 Subject: [PATCH 18/56] YQ-3566 fix sql injection in create binding request (#8274) --- ydb/core/fq/libs/common/util.cpp | 6 ++++ ydb/core/fq/libs/common/util_ut.cpp | 4 +++ ydb/tests/fq/s3/test_bindings_1.py | 48 +++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/ydb/core/fq/libs/common/util.cpp b/ydb/core/fq/libs/common/util.cpp index 61d2ea43bbda..b1d6399165f3 100644 --- a/ydb/core/fq/libs/common/util.cpp +++ b/ydb/core/fq/libs/common/util.cpp @@ -62,18 +62,24 @@ class TIssueDatabaseRemover { TString DatabasePath; }; +void EscapeBackslashes(TString& value) { + SubstGlobal(value, "\\", "\\\\"); +} + } TString EscapeString(const TString& value, const TString& enclosingSeq, const TString& replaceWith) { auto escapedValue = value; + EscapeBackslashes(escapedValue); SubstGlobal(escapedValue, enclosingSeq, replaceWith); return escapedValue; } TString EscapeString(const TString& value, char enclosingChar) { auto escapedValue = value; + EscapeBackslashes(escapedValue); SubstGlobal(escapedValue, TString{enclosingChar}, TStringBuilder{} << '\\' << enclosingChar); diff --git a/ydb/core/fq/libs/common/util_ut.cpp b/ydb/core/fq/libs/common/util_ut.cpp index 3e5a7d3e1202..3e209c1349c3 100644 --- a/ydb/core/fq/libs/common/util_ut.cpp +++ b/ydb/core/fq/libs/common/util_ut.cpp @@ -23,7 +23,9 @@ Y_UNIT_TEST_SUITE(EscapingBasics) { UNIT_ASSERT_VALUES_EQUAL(EscapeString("some_secret1", '"'), 
"some_secret1"); UNIT_ASSERT_VALUES_EQUAL(EscapeString("some_secret1", "}+{", "[*]"), "some_secret1"); UNIT_ASSERT_VALUES_EQUAL(EscapeString("some\"_\"secret1", '"'), "some\\\"_\\\"secret1"); + UNIT_ASSERT_VALUES_EQUAL(EscapeString("some\"_\\\"secret1", '"'), "some\\\"_\\\\\\\"secret1"); UNIT_ASSERT_VALUES_EQUAL(EscapeString("some}+{_}+{secret1", "}+{", "[*]"), "some[*]_[*]secret1"); + UNIT_ASSERT_VALUES_EQUAL(EscapeString("some}+{\\}+{secret1", "}+{", "[*]"), "some[*]\\\\[*]secret1"); } Y_UNIT_TEST(EncloseAndEscapeStringShouldWork) { @@ -31,7 +33,9 @@ Y_UNIT_TEST_SUITE(EscapingBasics) { UNIT_ASSERT_VALUES_EQUAL(EncloseAndEscapeString("some_secret1\nsome_secret2", "}+{", "[*]"), "}+{some_secret1\nsome_secret2}+{"); UNIT_ASSERT_VALUES_EQUAL(EncloseAndEscapeString("some\"_\"secret1", '"'), "\"some\\\"_\\\"secret1\""); + UNIT_ASSERT_VALUES_EQUAL(EncloseAndEscapeString("some\"_\\\"secret1", '"'), "\"some\\\"_\\\\\\\"secret1\""); UNIT_ASSERT_VALUES_EQUAL(EncloseAndEscapeString("some_secret1}+{\n}+{some_secret2", "}+{", "[*]"), "}+{some_secret1[*]\n[*]some_secret2}+{"); + UNIT_ASSERT_VALUES_EQUAL(EncloseAndEscapeString("some_secret1}+{\\}+{some_secret2", "}+{", "[*]"), "}+{some_secret1[*]\\\\[*]some_secret2}+{"); } } diff --git a/ydb/tests/fq/s3/test_bindings_1.py b/ydb/tests/fq/s3/test_bindings_1.py index cb4cbb8aaf5f..0ecbf211c5bc 100644 --- a/ydb/tests/fq/s3/test_bindings_1.py +++ b/ydb/tests/fq/s3/test_bindings_1.py @@ -277,3 +277,51 @@ def test_raw_empty_schema_binding(self, kikimr, client, unique_prefix): assert "Only one column in schema supported in raw format" in str(binding_response.issues), str( binding_response.issues ) + + @yq_all + @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True) + def test_binding_with_backslash_in_location(self, s3, client, unique_prefix): + resource = boto3.resource( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + bucket = 
resource.Bucket("backslash_bucket") + bucket.create(ACL='public-read') + + s3_client = boto3.client( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + data = R'''data +test''' + s3_client.put_object(Body=data, Bucket='backslash_bucket', Key='\\', ContentType='text/plain') + + connection_response = client.create_storage_connection(unique_prefix + "backslash_bucket", "backslash_bucket") + + data_type = ydb.Column(name="data", type=ydb.Type(type_id=ydb.Type.PrimitiveTypeId.UTF8)) + storage_binding_name = unique_prefix + "binding_name" + client.create_object_storage_binding( + name=storage_binding_name, + path="\\", + format="csv_with_names", + connection_id=connection_response.result.connection_id, + columns=[data_type], + ) + + sql = fR''' + SELECT * + FROM bindings.{storage_binding_name}; + ''' + + query_id = client.create_query( + "simple", sql, type=fq.QueryContent.QueryType.ANALYTICS, pg_syntax=True + ).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + + data = client.get_result_data(query_id) + result_set = data.result.result_set + logging.debug(str(result_set)) + assert len(result_set.columns) == 1 + assert result_set.columns[0].name == "data" + assert len(result_set.rows) == 1 + assert result_set.rows[0].items[0].text_value == "test" From a86ef583b0a0117643d482fc115542038c7490d0 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Tue, 27 Aug 2024 11:18:09 +0300 Subject: [PATCH 19/56] listing fix 1724681819 (#8281) Co-authored-by: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> --- .../external_source_factory.cpp | 5 +- .../external_source_factory.h | 3 +- ydb/core/external_sources/object_storage.cpp | 13 ++-- ydb/core/external_sources/object_storage.h | 3 +- .../external_sources/object_storage_ut.cpp | 8 +- ydb/core/fq/libs/actors/run_actor.cpp | 2 +- ydb/core/fq/libs/init/init.cpp | 2 +- .../kqp/compute_actor/kqp_compute_actor.cpp | 2 +- 
ydb/core/kqp/host/kqp_host.cpp | 4 +- ydb/core/tx/schemeshard/schemeshard_impl.cpp | 5 +- .../s3/actors/yql_s3_actors_factory_impl.cpp | 7 +- .../s3/actors/yql_s3_raw_read_actor.cpp | 14 +++- .../s3/actors/yql_s3_raw_read_actor.h | 3 +- .../providers/s3/actors/yql_s3_read_actor.cpp | 17 +++-- .../providers/s3/actors/yql_s3_read_actor.h | 6 +- .../s3/actors/yql_s3_source_factory.h | 5 +- .../s3/actors/yql_s3_source_queue.cpp | 16 ++-- .../providers/s3/actors/yql_s3_source_queue.h | 3 +- .../actors_factory/yql_s3_actors_factory.cpp | 4 +- .../s3/actors_factory/yql_s3_actors_factory.h | 3 +- .../yql/providers/s3/object_listers/ya.make | 1 + .../s3/object_listers/yql_s3_list.cpp | 54 ++++++++----- .../providers/s3/object_listers/yql_s3_list.h | 5 +- .../s3/provider/yql_s3_dq_integration.cpp | 3 +- .../s3/provider/yql_s3_io_discovery.cpp | 3 +- .../s3/provider/yql_s3_listing_strategy.cpp | 8 +- .../providers/s3/provider/yql_s3_provider.cpp | 5 +- .../providers/s3/provider/yql_s3_provider.h | 3 +- ydb/library/yql/tools/dqrun/dqrun.cpp | 2 +- ydb/tests/fq/s3/test_s3_1.py | 76 +++++++++++++++++++ ydb/tests/tools/fq_runner/kikimr_runner.py | 2 +- 31 files changed, 217 insertions(+), 70 deletions(-) diff --git a/ydb/core/external_sources/external_source_factory.cpp b/ydb/core/external_sources/external_source_factory.cpp index c0be11d62eab..ee6884383f63 100644 --- a/ydb/core/external_sources/external_source_factory.cpp +++ b/ydb/core/external_sources/external_source_factory.cpp @@ -36,12 +36,13 @@ IExternalSourceFactory::TPtr CreateExternalSourceFactory(const std::vector credentialsFactory, - bool enableInfer) { + bool enableInfer, + bool allowLocalFiles) { std::vector hostnamePatternsRegEx(hostnamePatterns.begin(), hostnamePatterns.end()); return MakeIntrusive(TMap{ { ToString(NYql::EDatabaseType::ObjectStorage), - CreateObjectStorageExternalSource(hostnamePatternsRegEx, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer) + 
CreateObjectStorageExternalSource(hostnamePatternsRegEx, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer, allowLocalFiles) }, { ToString(NYql::EDatabaseType::ClickHouse), diff --git a/ydb/core/external_sources/external_source_factory.h b/ydb/core/external_sources/external_source_factory.h index 7b49e0e74544..27b5ebd8e0ec 100644 --- a/ydb/core/external_sources/external_source_factory.h +++ b/ydb/core/external_sources/external_source_factory.h @@ -15,6 +15,7 @@ IExternalSourceFactory::TPtr CreateExternalSourceFactory(const std::vector credentialsFactory = nullptr, - bool enableInfer = false); + bool enableInfer = false, + bool allowLocalFiles = false); } diff --git a/ydb/core/external_sources/object_storage.cpp b/ydb/core/external_sources/object_storage.cpp index 32f90bc976f3..93e2189d3095 100644 --- a/ydb/core/external_sources/object_storage.cpp +++ b/ydb/core/external_sources/object_storage.cpp @@ -34,12 +34,14 @@ struct TObjectStorageExternalSource : public IExternalSource { NActors::TActorSystem* actorSystem, size_t pathsLimit, std::shared_ptr credentialsFactory, - bool enableInfer) + bool enableInfer, + bool allowLocalFiles) : HostnamePatterns(hostnamePatterns) , PathsLimit(pathsLimit) , ActorSystem(actorSystem) , CredentialsFactory(std::move(credentialsFactory)) , EnableInfer(enableInfer) + , AllowLocalFiles(allowLocalFiles) {} virtual TString Pack(const NKikimrExternalSources::TSchema& schema, @@ -290,7 +292,6 @@ struct TObjectStorageExternalSource : public IExternalSource { }; virtual NThreading::TFuture> LoadDynamicMetadata(std::shared_ptr meta) override { - Y_UNUSED(ActorSystem); auto format = meta->Attributes.FindPtr("format"); if (!format || !meta->Attributes.contains("withinfer")) { return NThreading::MakeFuture(std::move(meta)); @@ -328,7 +329,7 @@ struct TObjectStorageExternalSource : public IExternalSource { .Url = meta->DataSourceLocation, .Credentials = credentials, .Pattern = meta->TableLocation, - }, Nothing(), false); + }, 
Nothing(), AllowLocalFiles, ActorSystem); auto afterListing = s3Lister->Next().Apply([path = meta->TableLocation](const NThreading::TFuture& listResFut) { auto& listRes = listResFut.GetValue(); if (std::holds_alternative(listRes)) { @@ -617,6 +618,7 @@ struct TObjectStorageExternalSource : public IExternalSource { NActors::TActorSystem* ActorSystem = nullptr; std::shared_ptr CredentialsFactory; const bool EnableInfer = false; + const bool AllowLocalFiles; }; } @@ -626,8 +628,9 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector credentialsFactory, - bool enableInfer) { - return MakeIntrusive(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer); + bool enableInfer, + bool allowLocalFiles) { + return MakeIntrusive(hostnamePatterns, actorSystem, pathsLimit, std::move(credentialsFactory), enableInfer, allowLocalFiles); } NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location) { diff --git a/ydb/core/external_sources/object_storage.h b/ydb/core/external_sources/object_storage.h index ae99009e5cde..74de7a69eb87 100644 --- a/ydb/core/external_sources/object_storage.h +++ b/ydb/core/external_sources/object_storage.h @@ -13,7 +13,8 @@ IExternalSource::TPtr CreateObjectStorageExternalSource(const std::vector credentialsFactory, - bool enableInfer); + bool enableInfer, + bool allowLocalFiles); NYql::TIssues Validate(const FederatedQuery::Schema& schema, const FederatedQuery::ObjectStorageBinding::Subset& objectStorage, size_t pathsLimit, const TString& location); diff --git a/ydb/core/external_sources/object_storage_ut.cpp b/ydb/core/external_sources/object_storage_ut.cpp index 887d1c1d4bbf..23fcc0e214a6 100644 --- a/ydb/core/external_sources/object_storage_ut.cpp +++ b/ydb/core/external_sources/object_storage_ut.cpp @@ -8,14 +8,14 @@ namespace NKikimr { Y_UNIT_TEST_SUITE(ObjectStorageTest) { 
Y_UNIT_TEST(SuccessValidation) { - auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false); + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false); NKikimrExternalSources::TSchema schema; NKikimrExternalSources::TGeneral general; UNIT_ASSERT_NO_EXCEPTION(source->Pack(schema, general)); } Y_UNIT_TEST(FailedCreate) { - auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false); + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false); NKikimrExternalSources::TSchema schema; NKikimrExternalSources::TGeneral general; general.mutable_attributes()->insert({"a", "b"}); @@ -23,7 +23,7 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { } Y_UNIT_TEST(FailedValidation) { - auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false); + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false); NKikimrExternalSources::TSchema schema; NKikimrExternalSources::TGeneral general; general.mutable_attributes()->insert({"projection.h", "b"}); @@ -31,7 +31,7 @@ Y_UNIT_TEST_SUITE(ObjectStorageTest) { } Y_UNIT_TEST(WildcardsValidation) { - auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false); + auto source = NExternalSource::CreateObjectStorageExternalSource({}, nullptr, 1000, nullptr, false, false); NKikimrExternalSources::TSchema schema; { // location diff --git a/ydb/core/fq/libs/actors/run_actor.cpp b/ydb/core/fq/libs/actors/run_actor.cpp index f52beadd6735..18e731e87d88 100644 --- a/ydb/core/fq/libs/actors/run_actor.cpp +++ b/ydb/core/fq/libs/actors/run_actor.cpp @@ -1940,7 +1940,7 @@ class TRunActor : public NActors::TActorBootstrapped { { dataProvidersInit.push_back(GetS3DataProviderInitializer(Params.S3Gateway, Params.CredentialsFactory, - 
Params.Config.GetReadActorsFactoryConfig().HasS3ReadActorFactoryConfig() ? Params.Config.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig().GetAllowLocalFiles() : Params.Config.GetGateways().GetS3().GetAllowLocalFiles())); // This part is for backward compatibility. TODO: remove this part after migration to TS3GatewayConfig + Params.Config.GetReadActorsFactoryConfig().HasS3ReadActorFactoryConfig() ? Params.Config.GetReadActorsFactoryConfig().GetS3ReadActorFactoryConfig().GetAllowLocalFiles() : Params.Config.GetGateways().GetS3().GetAllowLocalFiles(), NActors::TActivationContext::ActorSystem())); // This part is for backward compatibility. TODO: remove this part after migration to TS3GatewayConfig } { diff --git a/ydb/core/fq/libs/init/init.cpp b/ydb/core/fq/libs/init/init.cpp index 46d9042f875b..19d745627364 100644 --- a/ydb/core/fq/libs/init/init.cpp +++ b/ydb/core/fq/libs/init/init.cpp @@ -216,7 +216,7 @@ void Init( RegisterYdbReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory); s3ActorsFactory->RegisterS3ReadActorFactory(*asyncIoFactory, credentialsFactory, httpGateway, s3HttpRetryPolicy, readActorFactoryCfg, - yqCounters->GetSubgroup("subsystem", "S3ReadActor")); + yqCounters->GetSubgroup("subsystem", "S3ReadActor"), protoConfig.GetGateways().GetS3().GetAllowLocalFiles()); s3ActorsFactory->RegisterS3WriteActorFactory(*asyncIoFactory, credentialsFactory, httpGateway, s3HttpRetryPolicy); diff --git a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp index f4528f079ed1..cc4502ca840c 100644 --- a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp +++ b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp @@ -78,7 +78,7 @@ NYql::NDq::IDqAsyncIoFactory::TPtr CreateKqpAsyncIoFactory( if (federatedQuerySetup) { auto s3HttpRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); - 
s3ActorsFactory->RegisterS3ReadActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy, federatedQuerySetup->S3ReadActorFactoryConfig); + s3ActorsFactory->RegisterS3ReadActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy, federatedQuerySetup->S3ReadActorFactoryConfig, nullptr, federatedQuerySetup->S3GatewayConfig.GetAllowLocalFiles()); s3ActorsFactory->RegisterS3WriteActorFactory(*factory, federatedQuerySetup->CredentialsFactory, federatedQuerySetup->HttpGateway, s3HttpRetryPolicy); if (federatedQuerySetup->ConnectorClient) { diff --git a/ydb/core/kqp/host/kqp_host.cpp b/ydb/core/kqp/host/kqp_host.cpp index 972689af9f48..075cf73642f9 100644 --- a/ydb/core/kqp/host/kqp_host.cpp +++ b/ydb/core/kqp/host/kqp_host.cpp @@ -1072,7 +1072,8 @@ class TKqpHost : public IKqpHost { ActorSystem, FederatedQuerySetup->S3GatewayConfig.GetGeneratorPathsLimit(), FederatedQuerySetup ? 
FederatedQuerySetup->CredentialsFactory : nullptr, - Config->FeatureFlags.GetEnableExternalSourceSchemaInference()); + Config->FeatureFlags.GetEnableExternalSourceSchemaInference(), + FederatedQuerySetup->S3GatewayConfig.GetAllowLocalFiles()); } } @@ -1698,6 +1699,7 @@ class TKqpHost : public IKqpHost { state->Gateway = FederatedQuerySetup->HttpGateway; state->GatewayRetryPolicy = NYql::GetHTTPDefaultRetryPolicy(NYql::THttpRetryPolicyOptions{.RetriedCurlCodes = NYql::FqRetriedCurlCodes()}); state->ExecutorPoolId = AppData()->UserPoolId; + state->ActorSystem = ActorSystem; auto dataSource = NYql::CreateS3DataSource(state); auto dataSink = NYql::CreateS3DataSink(state); diff --git a/ydb/core/tx/schemeshard/schemeshard_impl.cpp b/ydb/core/tx/schemeshard/schemeshard_impl.cpp index 05098b99d693..69a66616bdce 100644 --- a/ydb/core/tx/schemeshard/schemeshard_impl.cpp +++ b/ydb/core/tx/schemeshard/schemeshard_impl.cpp @@ -6966,7 +6966,10 @@ void TSchemeShard::ApplyConsoleConfigs(const NKikimrConfig::TAppConfig& appConfi ExternalSourceFactory = NExternalSource::CreateExternalSourceFactory( std::vector(hostnamePatterns.begin(), hostnamePatterns.end()), nullptr, - appConfig.GetQueryServiceConfig().GetS3().GetGeneratorPathsLimit() + appConfig.GetQueryServiceConfig().GetS3().GetGeneratorPathsLimit(), + nullptr, + appConfig.GetFeatureFlags().GetEnableExternalSourceSchemaInference(), + appConfig.GetQueryServiceConfig().GetS3().GetAllowLocalFiles() ); } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_actors_factory_impl.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_actors_factory_impl.cpp index 7affd433e1a8..202cbebf3d21 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_actors_factory_impl.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_actors_factory_impl.cpp @@ -64,16 +64,17 @@ namespace NYql::NDq { IHTTPGateway::TPtr gateway, const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TS3ReadActorFactoryConfig& cfg, - ::NMonitoring::TDynamicCounterPtr 
counters) override { + ::NMonitoring::TDynamicCounterPtr counters, + bool allowLocalFiles) override { #if defined(_linux_) || defined(_darwin_) NDB::registerFormats(); factory.RegisterSource("S3Source", - [credentialsFactory, gateway, retryPolicy, cfg, counters](NS3::TSource&& settings, IDqAsyncIoFactory::TSourceArguments&& args) { + [credentialsFactory, gateway, retryPolicy, cfg, counters, allowLocalFiles](NS3::TSource&& settings, IDqAsyncIoFactory::TSourceArguments&& args) { return CreateS3ReadActor(args.TypeEnv, args.HolderFactory, gateway, std::move(settings), args.InputIndex, args.StatsLevel, args.TxId, args.SecureParams, args.TaskParams, args.ReadRanges, args.ComputeActorId, credentialsFactory, retryPolicy, cfg, - counters, args.TaskCounters, args.MemoryQuotaManager); + counters, args.TaskCounters, args.MemoryQuotaManager, allowLocalFiles); }); #else Y_UNUSED(factory); diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp index 29381cdad996..50fdf1894ef8 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp @@ -59,7 +59,8 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID NActors::TActorId fileQueueActor, ui64 fileQueueBatchSizeLimit, ui64 fileQueueBatchObjectCountLimit, - ui64 fileQueueConsumersCountDelta) + ui64 fileQueueConsumersCountDelta, + bool allowLocalFiles) : ReadActorFactoryCfg(readActorFactoryCfg) , Gateway(std::move(gateway)) , HolderFactory(holderFactory) @@ -76,6 +77,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID , FileQueueActor(fileQueueActor) , AddPathIndex(addPathIndex) , SizeLimit(sizeLimit) + , AllowLocalFiles(allowLocalFiles) , Counters(counters) , TaskCounters(taskCounters) , FileSizeLimit(fileSizeLimit) @@ -116,7 +118,8 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID Credentials, Pattern, 
PatternVariant, - NYql::NS3Lister::ES3PatternType::Wildcard)); + NYql::NS3Lister::ES3PatternType::Wildcard, + AllowLocalFiles)); } LOG_D("TS3ReadActor", "Bootstrap" << ", InputIndex: " << InputIndex << ", FileQueue: " << FileQueueActor << (UseRuntimeListing ? " (remote)" : " (local")); @@ -467,6 +470,7 @@ class TS3ReadActor : public NActors::TActorBootstrapped, public ID const bool AddPathIndex; const ui64 SizeLimit; TDuration CpuTime; + const bool AllowLocalFiles; std::queue> Blocks; @@ -521,7 +525,8 @@ std::pair CreateRawRead NActors::TActorId fileQueueActor, ui64 fileQueueBatchSizeLimit, ui64 fileQueueBatchObjectCountLimit, - ui64 fileQueueConsumersCountDelta + ui64 fileQueueConsumersCountDelta, + bool allowLocalFiles ) { const auto actor = new TS3ReadActor( inputIndex, @@ -547,7 +552,8 @@ std::pair CreateRawRead fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, - fileQueueConsumersCountDelta + fileQueueConsumersCountDelta, + allowLocalFiles ); return {actor, actor}; diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h index 102ea19c94c3..791e19502870 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.h @@ -36,7 +36,8 @@ std::pair CreateRawRead NActors::TActorId fileQueueActor, ui64 fileQueueBatchSizeLimit, ui64 fileQueueBatchObjectCountLimit, - ui64 fileQueueConsumersCountDelta + ui64 fileQueueConsumersCountDelta, + bool allowLocalFiles ); } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index c8dc7c4cb7ea..165b61ba6ab4 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -1292,7 +1292,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public ui64 
fileQueueBatchObjectCountLimit, ui64 fileQueueConsumersCountDelta, bool asyncDecoding, - bool asyncDecompressing + bool asyncDecompressing, + bool allowLocalFiles ) : ReadActorFactoryCfg(readActorFactoryCfg) , Gateway(std::move(gateway)) , HolderFactory(holderFactory) @@ -1319,7 +1320,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public , FileQueueBatchObjectCountLimit(fileQueueBatchObjectCountLimit) , FileQueueConsumersCountDelta(fileQueueConsumersCountDelta) , AsyncDecoding(asyncDecoding) - , AsyncDecompressing(asyncDecompressing) { + , AsyncDecompressing(asyncDecompressing) + , AllowLocalFiles(allowLocalFiles) { if (Counters) { QueueDataSize = Counters->GetCounter("QueueDataSize"); QueueDataLimit = Counters->GetCounter("QueueDataLimit"); @@ -1396,7 +1398,8 @@ class TS3StreamReadActor : public TActorBootstrapped, public Credentials, Pattern, PatternVariant, - ES3PatternType::Wildcard)); + ES3PatternType::Wildcard, + AllowLocalFiles)); } FileQueueEvents.Init(TxId, SelfId(), SelfId()); FileQueueEvents.OnNewRecipientId(FileQueueActor); @@ -1904,6 +1907,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public ui64 FileQueueConsumersCountDelta; const bool AsyncDecoding; const bool AsyncDecompressing; + const bool AllowLocalFiles; bool IsCurrentBatchEmpty = false; bool IsFileQueueEmpty = false; bool IsWaitingFileQueueResponse = false; @@ -2067,7 +2071,8 @@ std::pair CreateS3ReadActor( const TS3ReadActorFactoryConfig& cfg, ::NMonitoring::TDynamicCounterPtr counters, ::NMonitoring::TDynamicCounterPtr taskCounters, - IMemoryQuotaManager::TPtr memoryQuotaManager) + IMemoryQuotaManager::TPtr memoryQuotaManager, + bool allowLocalFiles) { const IFunctionRegistry& functionRegistry = *holderFactory.GetFunctionRegistry(); @@ -2257,7 +2262,7 @@ std::pair CreateS3ReadActor( std::move(paths), addPathIndex, readSpec, computeActorId, retryPolicy, cfg, counters, taskCounters, fileSizeLimit, sizeLimit, rowsLimitHint, memoryQuotaManager, 
params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta, - params.GetAsyncDecoding(), params.GetAsyncDecompressing()); + params.GetAsyncDecoding(), params.GetAsyncDecompressing(), allowLocalFiles); return {actor, actor}; } else { @@ -2268,7 +2273,7 @@ std::pair CreateS3ReadActor( return CreateRawReadActor(inputIndex, statsLevel, txId, std::move(gateway), holderFactory, params.GetUrl(), credentials, pathPattern, pathPatternVariant, std::move(paths), addPathIndex, computeActorId, sizeLimit, retryPolicy, cfg, counters, taskCounters, fileSizeLimit, rowsLimitHint, - params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta); + params.GetUseRuntimeListing(), fileQueueActor, fileQueueBatchSizeLimit, fileQueueBatchObjectCountLimit, fileQueueConsumersCountDelta, allowLocalFiles); } } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h index 5de66acf6f1f..49e46f81f4e6 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.h @@ -29,7 +29,8 @@ NActors::IActor* CreateS3FileQueueActor( const TS3Credentials& credentials, TString pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, - NS3Lister::ES3PatternType patternType); + NS3Lister::ES3PatternType patternType, + bool allowLocalFiles); std::pair CreateS3ReadActor( const NKikimr::NMiniKQL::TTypeEnvironment& typeEnv, @@ -48,6 +49,7 @@ std::pair CreateS3ReadA const TS3ReadActorFactoryConfig& cfg, ::NMonitoring::TDynamicCounterPtr counters, ::NMonitoring::TDynamicCounterPtr taskCounters, - IMemoryQuotaManager::TPtr memoryQuotaManager); + IMemoryQuotaManager::TPtr memoryQuotaManager, + bool allowLocalFiles); } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_factory.h 
b/ydb/library/yql/providers/s3/actors/yql_s3_source_factory.h index 5b8961161d5d..e68d893a645d 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_factory.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_factory.h @@ -10,9 +10,10 @@ void RegisterS3ReadActorFactory( IHTTPGateway::TPtr gateway = IHTTPGateway::Make(), const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy = GetHTTPDefaultRetryPolicy(), const TS3ReadActorFactoryConfig& factoryConfig = {}, - ::NMonitoring::TDynamicCounterPtr counters = nullptr) { + ::NMonitoring::TDynamicCounterPtr counters = nullptr, + bool allowLocalFiles = false) { CreateS3ActorsFactory()->RegisterS3ReadActorFactory( - factory, credentialsFactory, gateway, retryPolicy, factoryConfig, counters + factory, credentialsFactory, gateway, retryPolicy, factoryConfig, counters, allowLocalFiles ); } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp index 060afbb4aea5..918953ad5b8d 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp @@ -176,7 +176,8 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped const TS3Credentials& credentials, TString pattern, NS3Lister::ES3PatternVariant patternVariant, - NS3Lister::ES3PatternType patternType) + NS3Lister::ES3PatternType patternType, + bool allowLocalFiles) : TxId(std::move(txId)) , PrefetchSize(prefetchSize) , FileSizeLimit(fileSizeLimit) @@ -192,7 +193,8 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped , Credentials(credentials) , Pattern(std::move(pattern)) , PatternVariant(patternVariant) - , PatternType(patternType) { + , PatternType(patternType) + , AllowLocalFiles(allowLocalFiles) { for (size_t i = 0; i < paths.size(); ++i) { NS3::FileQueue::TObjectPath object; object.SetPath(paths[i].Path); @@ -500,7 +502,8 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped PatternType, 
object.GetPath()}, Nothing(), - false); + AllowLocalFiles, + NActors::TActivationContext::ActorSystem()); Fetch(); return true; } @@ -620,6 +623,7 @@ class TS3FileQueueActor : public NActors::TActorBootstrapped const TString Pattern; const NS3Lister::ES3PatternVariant PatternVariant; const NS3Lister::ES3PatternType PatternType; + const bool AllowLocalFiles; static constexpr TDuration PoisonTimeout = TDuration::Hours(3); static constexpr TDuration RoundRobinStageTimeout = TDuration::Seconds(3); @@ -641,7 +645,8 @@ NActors::IActor* CreateS3FileQueueActor( const TS3Credentials& credentials, TString pattern, NS3Lister::ES3PatternVariant patternVariant, - NS3Lister::ES3PatternType patternType) { + NS3Lister::ES3PatternType patternType, + bool allowLocalFiles) { return new TS3FileQueueActor( txId, paths, @@ -658,7 +663,8 @@ NActors::IActor* CreateS3FileQueueActor( credentials, pattern, patternVariant, - patternType + patternType, + allowLocalFiles ); } diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h index 86fd9aa1d385..33686d5d88fe 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.h @@ -27,6 +27,7 @@ NActors::IActor* CreateS3FileQueueActor( const TS3Credentials& credentials, TString pattern, NYql::NS3Lister::ES3PatternVariant patternVariant, - NS3Lister::ES3PatternType patternType); + NS3Lister::ES3PatternType patternType, + bool allowLocalFiles); } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp index 3b14babb1dc3..389ddb3ec875 100644 --- a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp +++ b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.cpp @@ -45,7 +45,8 @@ namespace NYql::NDq { IHTTPGateway::TPtr gateway, const 
IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TS3ReadActorFactoryConfig& factoryConfig = {}, - ::NMonitoring::TDynamicCounterPtr counters = nullptr) override { + ::NMonitoring::TDynamicCounterPtr counters = nullptr, + bool allowLocalFiles = false) override { Y_UNUSED(factory); Y_UNUSED(credentialsFactory); @@ -53,6 +54,7 @@ namespace NYql::NDq { Y_UNUSED(retryPolicy); Y_UNUSED(factoryConfig); Y_UNUSED(counters); + Y_UNUSED(allowLocalFiles); } }; diff --git a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h index a7c55e50348b..84f92fa66548 100644 --- a/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h +++ b/ydb/library/yql/providers/s3/actors_factory/yql_s3_actors_factory.h @@ -48,7 +48,8 @@ namespace NYql::NDq { IHTTPGateway::TPtr gateway, const IHTTPGateway::TRetryPolicy::TPtr& retryPolicy, const TS3ReadActorFactoryConfig& cfg = {}, - ::NMonitoring::TDynamicCounterPtr counters = nullptr) = 0; + ::NMonitoring::TDynamicCounterPtr counters = nullptr, + bool allowLocalFiles = false) = 0; }; std::shared_ptr CreateDefaultS3ActorsFactory(); diff --git a/ydb/library/yql/providers/s3/object_listers/ya.make b/ydb/library/yql/providers/s3/object_listers/ya.make index 2d284a9b4d9a..62849a03a7de 100644 --- a/ydb/library/yql/providers/s3/object_listers/ya.make +++ b/ydb/library/yql/providers/s3/object_listers/ya.make @@ -14,6 +14,7 @@ PEERDIR( ydb/library/yql/providers/common/http_gateway ydb/library/yql/providers/s3/credentials ydb/library/yql/utils + ydb/library/yql/utils/actor_log ydb/library/yql/utils/threading ) diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp index f62c12ea1d4c..5a764a7782ec 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.cpp @@ -3,6 +3,7 @@ #include #include +#include 
#include #include #include @@ -25,7 +26,7 @@ namespace NYql::NS3Lister { IOutputStream& operator<<(IOutputStream& stream, const TListingRequest& request) { - return stream << "TListingRequest{.url=" << request.Url + return stream << "[TS3Lister] TListingRequest{.url=" << request.Url << ",.Prefix=" << request.Prefix << ",.Pattern=" << request.Pattern << ",.PatternType=" << request.PatternType @@ -51,7 +52,7 @@ std::pair MakeFilterRegexp(const TString& regex, const size_t numGroups = re->NumberOfCapturingGroups(); YQL_CLOG(DEBUG, ProviderS3) - << "Got regex: '" << regex << "' with " << numGroups << " capture groups "; + << "[TS3Lister] Got regex: '" << regex << "' with " << numGroups << " capture groups "; auto groups = std::make_shared>(numGroups); auto reArgs = std::make_shared>(numGroups); @@ -101,7 +102,7 @@ std::pair MakeFilterWildcard(const TString& patt } const auto regex = NS3::RegexFromWildcards(pattern); - YQL_CLOG(DEBUG, ProviderS3) << "Got prefix: '" << regexPatternPrefix << "', regex: '" + YQL_CLOG(DEBUG, ProviderS3) << "[TS3Lister] Got prefix: '" << regexPatternPrefix << "', regex: '" << regex << "' from original pattern '" << pattern << "'"; return MakeFilterRegexp(regex, sharedCtx); @@ -238,6 +239,8 @@ class TS3Lister : public IS3Lister { const TMaybe Delimiter; const TMaybe ContinuationToken; const ui64 MaxKeys; + const std::pair CurrentLogContextPath; + const NActors::TActorSystem* ActorSystem; }; TS3Lister( @@ -246,7 +249,8 @@ class TS3Lister : public IS3Lister { const TListingRequest& listingRequest, const TMaybe& delimiter, size_t maxFilesPerQuery, - TSharedListingContextPtr sharedCtx) + TSharedListingContextPtr sharedCtx, + NActors::TActorSystem* actorSystem) : MaxFilesPerQuery(maxFilesPerQuery) { Y_ENSURE( listingRequest.Url.substr(0, 7) != "file://", @@ -270,7 +274,9 @@ class TS3Lister : public IS3Lister { std::move(request), delimiter, Nothing(), - MaxFilesPerQuery}; + MaxFilesPerQuery, + NLog::CurrentLogContextPath(), + actorSystem}; 
YQL_CLOG(TRACE, ProviderS3) << "[TS3Lister] Got URL: '" << ctx.ListingRequest.Url @@ -335,9 +341,19 @@ class TS3Lister : public IS3Lister { /*data=*/"", retryPolicy); } + static IHTTPGateway::TOnResult CallbackFactoryMethod(TListingContext&& listingContext) { return [c = std::move(listingContext)](IHTTPGateway::TResult&& result) { - OnDiscovery(c, std::move(result)); + if (c.ActorSystem) { + NDq::TYqlLogScope logScope(c.ActorSystem, NKikimrServices::KQP_YQL, c.CurrentLogContextPath.first, c.CurrentLogContextPath.second); + OnDiscovery(c, std::move(result)); + } else { + /* + If the subsystem doesn't use the actor system + then there is a need to use an own YqlLoggerScope on the top level + */ + OnDiscovery(c, std::move(result)); + } }; } @@ -351,7 +367,7 @@ class TS3Lister : public IS3Lister { const NXml::TDocument xml(xmlString, NXml::TDocument::String); auto parsedResponse = ParseListObjectV2Response(xml, ctx.RequestId); YQL_CLOG(DEBUG, ProviderS3) - << "Listing of " << ctx.ListingRequest.Url + << "[TS3Lister] Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix << ": have " << ctx.Output->Size() << " entries, got another " << parsedResponse.KeyCount << " entries, request id: [" << ctx.RequestId << "]"; @@ -380,7 +396,7 @@ class TS3Lister : public IS3Lister { } if (parsedResponse.IsTruncated && !earlyStop) { - YQL_CLOG(DEBUG, ProviderS3) << "Listing of " << ctx.ListingRequest.Url + YQL_CLOG(DEBUG, ProviderS3) << "[TS3Lister] Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix << ": got truncated flag, will continue"; @@ -409,14 +425,14 @@ class TS3Lister : public IS3Lister { TStringBuilder{} << "request id: [" << ctx.RequestId << "]", std::move(result.Issues)); YQL_CLOG(INFO, ProviderS3) - << "Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix + << "[TS3Lister] Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix << ": got error from http gateway: " << issues.ToString(true); 
ctx.Promise.SetValue(TListError{EListError::GENERAL, std::move(issues)}); ctx.NextRequestPromise.SetValue(Nothing()); } } catch (const std::exception& ex) { YQL_CLOG(INFO, ProviderS3) - << "Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix + << "[TS3Lister] Listing of " << ctx.ListingRequest.Url << ctx.ListingRequest.Prefix << " : got exception: " << ex.what(); ctx.Promise.SetException(std::current_exception()); ctx.NextRequestPromise.SetValue(Nothing()); @@ -452,9 +468,10 @@ class TS3ParallelLimitedListerFactory : public IS3ListerFactory { using TPtr = std::shared_ptr; explicit TS3ParallelLimitedListerFactory( - size_t maxParallelOps, TSharedListingContextPtr sharedCtx) + size_t maxParallelOps, TSharedListingContextPtr sharedCtx, NActors::TActorSystem* actorSystem) : SharedCtx(std::move(sharedCtx)) - , Semaphore(TAsyncSemaphore::Make(std::max(1, maxParallelOps))) { } + , Semaphore(TAsyncSemaphore::Make(std::max(1, maxParallelOps))) + , ActorSystem(actorSystem) { } TFuture Make( const IHTTPGateway::TPtr& httpGateway, @@ -464,10 +481,10 @@ class TS3ParallelLimitedListerFactory : public IS3ListerFactory { bool allowLocalFiles) override { auto acquired = Semaphore->AcquireAsync(); return acquired.Apply( - [ctx = SharedCtx, httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles](const auto& f) { + [ctx = SharedCtx, httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles, actorSystem = ActorSystem](const auto& f) { return std::shared_ptr(new TListerLockReleaseWrapper{ NS3Lister::MakeS3Lister( - httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles, ctx), + httpGateway, retryPolicy, listingRequest, delimiter, allowLocalFiles, actorSystem, ctx), std::make_unique( f.GetValue()->MakeAutoRelease())}); }); @@ -503,6 +520,7 @@ class TS3ParallelLimitedListerFactory : public IS3ListerFactory { private: TSharedListingContextPtr SharedCtx; const TAsyncSemaphore::TPtr Semaphore; + NActors::TActorSystem* ActorSystem; }; } 
// namespace @@ -513,10 +531,11 @@ IS3Lister::TPtr MakeS3Lister( const TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles, + NActors::TActorSystem* actorSystem, TSharedListingContextPtr sharedCtx) { if (listingRequest.Url.substr(0, 7) != "file://") { return std::make_shared( - httpGateway, retryPolicy, listingRequest, delimiter, 1000, std::move(sharedCtx)); + httpGateway, retryPolicy, listingRequest, delimiter, 1000, std::move(sharedCtx), actorSystem); } if (!allowLocalFiles) { @@ -530,13 +549,14 @@ IS3ListerFactory::TPtr MakeS3ListerFactory( size_t maxParallelOps, size_t callbackThreadCount, size_t callbackPerThreadQueueSize, - size_t regexpCacheSize) { + size_t regexpCacheSize, + NActors::TActorSystem* actorSystem) { std::shared_ptr sharedCtx = nullptr; if (callbackThreadCount != 0 || regexpCacheSize != 0) { sharedCtx = std::make_shared( callbackThreadCount, callbackPerThreadQueueSize, regexpCacheSize); } - return std::make_shared(maxParallelOps, sharedCtx); + return std::make_shared(maxParallelOps, sharedCtx, actorSystem); } } // namespace NYql::NS3Lister diff --git a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h index 93fafae19057..3419ec3fd462 100644 --- a/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h +++ b/ydb/library/yql/providers/s3/object_listers/yql_s3_list.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -169,6 +170,7 @@ IS3Lister::TPtr MakeS3Lister( const TListingRequest& listingRequest, const TMaybe& delimiter, bool allowLocalFiles, + NActors::TActorSystem* actorSystem, TSharedListingContextPtr sharedCtx = nullptr); class IS3ListerFactory { @@ -189,7 +191,8 @@ IS3ListerFactory::TPtr MakeS3ListerFactory( size_t maxParallelOps, size_t callbackThreadCount, size_t callbackPerThreadQueueSize, - size_t regexpCacheSize); + size_t regexpCacheSize, + NActors::TActorSystem* actorSystem); } // namespace NS3Lister } // 
namespace NYql diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index 7a16fc818aa4..3d5d7adc3179 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -546,7 +546,8 @@ class TS3DqIntegration: public TDqIntegrationBase { TS3Credentials(State_->CredentialsFactory, State_->Configuration->Tokens.at(cluster)), pathPattern, pathPatternVariant, - NS3Lister::ES3PatternType::Wildcard + NS3Lister::ES3PatternType::Wildcard, + State_->Configuration->AllowLocalFiles ), NActors::TMailboxType::HTSwap, State_->ExecutorPoolId diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp index 436a116ecdf1..e78d47a00f2b 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_io_discovery.cpp @@ -83,7 +83,8 @@ class TS3IODiscoveryTransformer : public TGraphTransformerBase { State_->Configuration->MaxInflightListsPerQuery, State_->Configuration->ListingCallbackThreadCount, State_->Configuration->ListingCallbackPerThreadQueueSize, - State_->Configuration->RegexpCacheSize)) + State_->Configuration->RegexpCacheSize, + State_->ActorSystem)) , ListingStrategy_(MakeS3ListingStrategy( State_->Gateway, State_->GatewayRetryPolicy, diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp index 843ba0bcf434..a7d52c408905 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp @@ -491,8 +491,14 @@ class TBFSDirectoryResolverIterator : public IS3Lister { }); return NextDirectoryListeningChunk; } + + static TString ParseBasePath(const TString& path) { + TString basePath = 
TString{TStringBuf{path}.RBefore('/')}; + return basePath == path && !basePath.EndsWith('/') ? TString{} : basePath; + } + void PerformEarlyStop(TListEntries& result, const TString& sourcePrefix) { - result.Directories.push_back({.Path = sourcePrefix}); + result.Directories.push_back({.Path = ParseBasePath(sourcePrefix)}); for (auto& directoryPrefix : DirectoryPrefixQueue) { result.Directories.push_back({.Path = directoryPrefix}); } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_provider.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_provider.cpp index 85707a21f16a..c283c53e8cab 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_provider.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_provider.cpp @@ -4,8 +4,8 @@ namespace NYql { -TDataProviderInitializer GetS3DataProviderInitializer(IHTTPGateway::TPtr gateway, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, bool allowLocalFiles) { - return [gateway, credentialsFactory, allowLocalFiles] ( +TDataProviderInitializer GetS3DataProviderInitializer(IHTTPGateway::TPtr gateway, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, bool allowLocalFiles, NActors::TActorSystem* actorSystem) { + return [gateway, credentialsFactory, allowLocalFiles, actorSystem] ( const TString& userName, const TString& sessionId, const TGatewaysConfig* gatewaysConfig, @@ -31,6 +31,7 @@ TDataProviderInitializer GetS3DataProviderInitializer(IHTTPGateway::TPtr gateway state->Types = typeCtx.Get(); state->FunctionRegistry = functionRegistry; state->CredentialsFactory = credentialsFactory; + state->ActorSystem = actorSystem; if (gatewaysConfig) { state->Configuration->Init(gatewaysConfig->GetS3(), typeCtx); } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_provider.h b/ydb/library/yql/providers/s3/provider/yql_s3_provider.h index 0bcf96290c7a..f5eaa96630c4 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_provider.h +++ 
b/ydb/library/yql/providers/s3/provider/yql_s3_provider.h @@ -32,9 +32,10 @@ struct TS3State : public TThrRefBase IHTTPGateway::TRetryPolicy::TPtr GatewayRetryPolicy = GetHTTPDefaultRetryPolicy(); ui32 ExecutorPoolId = 0; std::list> PrimaryKeys; + NActors::TActorSystem* ActorSystem = nullptr; }; -TDataProviderInitializer GetS3DataProviderInitializer(IHTTPGateway::TPtr gateway, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory = nullptr, bool allowLocalFiles = false); +TDataProviderInitializer GetS3DataProviderInitializer(IHTTPGateway::TPtr gateway, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory = nullptr, bool allowLocalFiles = false, NActors::TActorSystem* actorSystem = nullptr); TIntrusivePtr CreateS3DataSource(TS3State::TPtr state); TIntrusivePtr CreateS3DataSink(TS3State::TPtr state); diff --git a/ydb/library/yql/tools/dqrun/dqrun.cpp b/ydb/library/yql/tools/dqrun/dqrun.cpp index d9d5f081a647..6d0f621f6e6c 100644 --- a/ydb/library/yql/tools/dqrun/dqrun.cpp +++ b/ydb/library/yql/tools/dqrun/dqrun.cpp @@ -909,7 +909,7 @@ int RunMain(int argc, const char* argv[]) if (!httpGateway) { httpGateway = IHTTPGateway::Make(gatewaysConfig.HasHttpGateway() ? 
&gatewaysConfig.GetHttpGateway() : nullptr); } - dataProvidersInit.push_back(GetS3DataProviderInitializer(httpGateway, nullptr, true)); + dataProvidersInit.push_back(GetS3DataProviderInitializer(httpGateway, nullptr, true, actorSystemManager->GetActorSystem())); } if (gatewaysConfig.HasPq()) { diff --git a/ydb/tests/fq/s3/test_s3_1.py b/ydb/tests/fq/s3/test_s3_1.py index 2a117867c7a5..0f4260bdf4cc 100644 --- a/ydb/tests/fq/s3/test_s3_1.py +++ b/ydb/tests/fq/s3/test_s3_1.py @@ -481,3 +481,79 @@ def test_huge_source(self, kikimr, s3, client, runtime_listing, unique_prefix): assert result_set.rows[0].items[0].uint64_value == 1024 * 10 # 1024 x 1024 x 10 = 10 MB of raw data + little overhead for header, eols etc assert sum(kikimr.control_plane.get_metering(1)) == 21 + + # it looks like the runtime_listing for v1 doesn't work in case of + # restart of query because the v1 keeps the compiled query in the cache + @yq_all + @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True) + @pytest.mark.parametrize("runtime_listing", ["false", "true"]) + def test_top_level_listing(self, kikimr, s3, client, runtime_listing, unique_prefix): + resource = boto3.resource( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + bucket = resource.Bucket("fbucket") + bucket.create(ACL='public-read') + bucket.objects.all().delete() + + s3_client = boto3.client( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + fruits = '''Fruit,Price,Weight +Banana,3,100 +Apple,2,22 +Pear,15,33''' + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='2024-08-09.csv', ContentType='text/plain') + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='2024-08-08.csv', ContentType='text/plain') + + kikimr.control_plane.wait_bootstrap(1) + storage_connection_name = unique_prefix + "test_top_level_listing" + client.create_storage_connection(storage_connection_name, "fbucket") + + sql = f''' 
+ pragma s3.UseRuntimeListing="{runtime_listing}"; + + SELECT * + FROM `{storage_connection_name}`.`/2024-08-*` + WITH (format=csv_with_names, SCHEMA ( + Fruit String NOT NULL, + Price Int NOT NULL, + Weight Int NOT NULL + ) + ); + ''' + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + + data = client.get_result_data(query_id) + result_set = data.result.result_set + logging.debug(str(result_set)) + assert len(result_set.columns) == 3 + assert result_set.columns[0].name == "Fruit" + assert result_set.columns[0].type.type_id == ydb.Type.STRING + assert result_set.columns[1].name == "Price" + assert result_set.columns[1].type.type_id == ydb.Type.INT32 + assert result_set.columns[2].name == "Weight" + assert result_set.columns[2].type.type_id == ydb.Type.INT32 + assert len(result_set.rows) == 6 + assert result_set.rows[0].items[0].bytes_value == b"Banana" + assert result_set.rows[0].items[1].int32_value == 3 + assert result_set.rows[0].items[2].int32_value == 100 + assert result_set.rows[1].items[0].bytes_value == b"Apple" + assert result_set.rows[1].items[1].int32_value == 2 + assert result_set.rows[1].items[2].int32_value == 22 + assert result_set.rows[2].items[0].bytes_value == b"Pear" + assert result_set.rows[2].items[1].int32_value == 15 + assert result_set.rows[2].items[2].int32_value == 33 + assert result_set.rows[3].items[0].bytes_value == b"Banana" + assert result_set.rows[3].items[1].int32_value == 3 + assert result_set.rows[3].items[2].int32_value == 100 + assert result_set.rows[4].items[0].bytes_value == b"Apple" + assert result_set.rows[4].items[1].int32_value == 2 + assert result_set.rows[4].items[2].int32_value == 22 + assert result_set.rows[5].items[0].bytes_value == b"Pear" + assert result_set.rows[5].items[1].int32_value == 15 + assert result_set.rows[5].items[2].int32_value == 33 + assert sum(kikimr.control_plane.get_metering(1)) == 
10 diff --git a/ydb/tests/tools/fq_runner/kikimr_runner.py b/ydb/tests/tools/fq_runner/kikimr_runner.py index f1ed9a18d6d9..3161591f2c78 100644 --- a/ydb/tests/tools/fq_runner/kikimr_runner.py +++ b/ydb/tests/tools/fq_runner/kikimr_runner.py @@ -484,7 +484,7 @@ def fill_config(self, control_plane): self.config_generator.yaml_config['grpc_config']['skip_scheme_check'] = True self.config_generator.yaml_config['grpc_config']['services'] = ["local_discovery", "yq", "yq_private"] # yq services - fq_config['control_plane_storage']['task_lease_ttl'] = "10s" + fq_config['control_plane_storage']['task_lease_ttl'] = "20s" self.fill_storage_config(fq_config['control_plane_storage']['storage'], "DbPoolStorage_" + self.uuid) else: self.config_generator.yaml_config.pop('grpc_config', None) From f0a53890604f2530f1a43bf50585f08418231c66 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Wed, 28 Aug 2024 21:36:13 +0300 Subject: [PATCH 20/56] scheme connection has been supported (#8232) (#8384) --- ydb/core/fq/libs/compute/common/config.h | 28 ++++++++++++++++++++ ydb/core/fq/libs/compute/common/utils.h | 3 +-- ydb/core/fq/libs/config/protos/compute.proto | 1 + 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/ydb/core/fq/libs/compute/common/config.h b/ydb/core/fq/libs/compute/common/config.h index dec5b1a84f57..a381ee593fc6 100644 --- a/ydb/core/fq/libs/compute/common/config.h +++ b/ydb/core/fq/libs/compute/common/config.h @@ -108,6 +108,34 @@ class TComputeConfig { } } + NFq::NConfig::TYdbStorageConfig GetSchemeConnection(const TString& scope) const { + const auto& controlPlane = ComputeConfig.GetYdb().GetControlPlane(); + switch (controlPlane.type_case()) { + case NConfig::TYdbComputeControlPlane::TYPE_NOT_SET: + return {}; + case NConfig::TYdbComputeControlPlane::kSingle: + return controlPlane.GetSingle().GetConnection(); + case NConfig::TYdbComputeControlPlane::kCms: + return GetSchemeConnection(scope, controlPlane.GetCms().GetDatabaseMapping()); + case 
NConfig::TYdbComputeControlPlane::kYdbcp: + return GetSchemeConnection(scope, controlPlane.GetYdbcp().GetDatabaseMapping()); + } + } + + NFq::NConfig::TYdbStorageConfig GetSchemeConnection(const TString& scope, const ::NFq::NConfig::TDatabaseMapping& databaseMapping) const { + auto it = databaseMapping.GetScopeToComputeDatabase().find(scope); + if (it != databaseMapping.GetScopeToComputeDatabase().end()) { + return it->second.HasSchemeConnection() ? it->second.GetSchemeConnection() : it->second.GetExecutionConnection(); // TODO: for backward compatibility, cleanup it after migration + } + + if (databaseMapping.GetCommon().empty()) { + return NFq::NConfig::TYdbStorageConfig{}; + } + + auto config = databaseMapping.GetCommon(MultiHash(scope) % databaseMapping.GetCommon().size()); + return config.HasSchemeConnection() ? config.GetSchemeConnection() : config.GetExecutionConnection(); // TODO: for backward compatibility, cleanup it after migration + } + NFq::NConfig::TYdbStorageConfig GetExecutionConnection(const TString& scope, const ::NFq::NConfig::TDatabaseMapping& databaseMapping) const { auto it = databaseMapping.GetScopeToComputeDatabase().find(scope); if (it != databaseMapping.GetScopeToComputeDatabase().end()) { diff --git a/ydb/core/fq/libs/compute/common/utils.h b/ydb/core/fq/libs/compute/common/utils.h index 50a468141e85..8fd7d56419bc 100644 --- a/ydb/core/fq/libs/compute/common/utils.h +++ b/ydb/core/fq/libs/compute/common/utils.h @@ -17,8 +17,7 @@ inline std::shared_ptr CreateNewTableClient(const TS const ::NFq::NConfig::TYdbStorageConfig& connection, const TYqSharedResources::TPtr& yqSharedResources, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory) { - - ::NFq::NConfig::TYdbStorageConfig computeConnection = computeConfig.GetExecutionConnection(scope); + ::NFq::NConfig::TYdbStorageConfig computeConnection = computeConfig.GetSchemeConnection(scope); computeConnection.set_endpoint(connection.endpoint()); 
computeConnection.set_database(connection.database()); computeConnection.set_usessl(connection.usessl()); diff --git a/ydb/core/fq/libs/config/protos/compute.proto b/ydb/core/fq/libs/config/protos/compute.proto index 60bc59125dde..31f2b6c77b9c 100644 --- a/ydb/core/fq/libs/config/protos/compute.proto +++ b/ydb/core/fq/libs/config/protos/compute.proto @@ -31,6 +31,7 @@ message TLoadControlConfig { message TComputeDatabaseConfig { TYdbStorageConfig ControlPlaneConnection = 1; + TYdbStorageConfig SchemeConnection = 6; TYdbStorageConfig ExecutionConnection = 3; string Tenant = 2; TLoadControlConfig LoadControlConfig = 4; From c5e5b9a1e4a4beeb0984f87dd7a684363ccd7c0d Mon Sep 17 00:00:00 2001 From: Vitaly Isaev Date: Tue, 3 Sep 2024 09:50:58 +0300 Subject: [PATCH 21/56] Merge #8604 (#8642) --- .../actors/query_utils.cpp | 1 + .../mdb_endpoint_generator.cpp | 3 +- .../datasource/clickhouse/docker-compose.yml | 2 +- .../datasource/postgresql/docker-compose.yml | 2 +- .../tests/datasource/ydb/docker-compose.yml | 2 +- .../connector/tests/join/docker-compose.yml | 2 +- .../connector/tests/utils/docker_compose.py | 2 +- ydb/tests/fq/generic/docker-compose.yml | 7 ++- ydb/tests/fq/generic/greenplum/Dockerfile | 4 ++ ydb/tests/fq/generic/greenplum/README.md | 1 + ydb/tests/fq/generic/greenplum/init_db.sh | 10 ++++ ydb/tests/fq/generic/test_greenplum.py | 47 +++++++++++++++++++ ydb/tests/fq/generic/test_join.py | 1 + .../fq/generic/utils/endpoint_determiner.py | 14 ++++-- ydb/tests/fq/generic/utils/settings.py | 17 +++++++ ydb/tests/fq/generic/ya.make | 1 + ydb/tests/tools/fq_runner/fq_client.py | 47 ++++++++++++------- ydb/tests/tools/fq_runner/kikimr_utils.py | 3 +- ydb/tests/tools/mdb_mock/__main__.py | 21 +++++++++ 19 files changed, 158 insertions(+), 29 deletions(-) create mode 100644 ydb/tests/fq/generic/greenplum/Dockerfile create mode 100644 ydb/tests/fq/generic/greenplum/README.md create mode 100644 ydb/tests/fq/generic/greenplum/init_db.sh create mode 100644 
ydb/tests/fq/generic/test_greenplum.py diff --git a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp index c69f279be3e3..9a24257e06e5 100644 --- a/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp +++ b/ydb/core/fq/libs/control_plane_proxy/actors/query_utils.cpp @@ -259,6 +259,7 @@ TString MakeCreateExternalDataSourceQuery( "schema"_a = gpschema ? ", SCHEMA=" + EncloseAndEscapeString(gpschema, '"') : TString{}); } + break; case FederatedQuery::ConnectionSetting::kMysqlCluster: { properties = fmt::format( R"( diff --git a/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp b/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp index 634d835070df..8c0523189eb1 100644 --- a/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp +++ b/ydb/core/fq/libs/db_id_async_resolver_impl/mdb_endpoint_generator.cpp @@ -13,9 +13,8 @@ namespace NFq { constexpr ui32 CLICKHOUSE_HTTP_SECURE_PORT = 8443; constexpr ui32 CLICKHOUSE_HTTP_INSECURE_PORT = 8123; - // Managed PostgreSQL provides the only port both for secure and insecure connections + // Managed PostgreSQL and Greenplum provide the only port both for secure and insecure connections constexpr ui32 POSTGRESQL_PORT = 6432; - constexpr ui32 GREENPLUM_PORT = 6432; constexpr ui32 MYSQL_PORT = 3306; diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml index dd58ce80b4df..c8fb08a8927f 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-ch-fq-connector-go - image: 
ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml index de6e29ea8bf4..5069925061a1 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml @@ -1,7 +1,7 @@ services: fq-connector-go: container_name: fq-tests-pg-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml index 37a2dec71d1d..baf6c9e4ab3e 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml @@ -5,7 +5,7 @@ services: echo \"$$(dig fq-tests-ydb-ydb +short) fq-tests-ydb-ydb\" >> /etc/hosts; cat /etc/hosts; /opt/ydb/bin/fq-connector-go server -c /opt/ydb/cfg/fq-connector-go.yaml" container_name: fq-tests-ydb-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 ports: - 2130 volumes: diff --git 
a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml index 8383a480bf6f..daf28b70f920 100644 --- a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-join-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/utils/docker_compose.py b/ydb/library/yql/providers/generic/connector/tests/utils/docker_compose.py index 305e8b666f66..fdb63938e23a 100644 --- a/ydb/library/yql/providers/generic/connector/tests/utils/docker_compose.py +++ b/ydb/library/yql/providers/generic/connector/tests/utils/docker_compose.py @@ -38,7 +38,7 @@ def __init__(self, docker_compose_yml_path: os.PathLike): self.docker_compose_yml_path = docker_compose_yml_path with open(self.docker_compose_yml_path) as f: - self.docker_compose_yml_data = yaml.load(f) + self.docker_compose_yml_data = yaml.load(f, Loader=yaml.FullLoader) def get_external_port(self, service_name: str, internal_port: int) -> int: cmd = [ diff --git a/ydb/tests/fq/generic/docker-compose.yml b/ydb/tests/fq/generic/docker-compose.yml index ce60fdfe7d63..19241b0b95aa 100644 --- a/ydb/tests/fq/generic/docker-compose.yml +++ b/ydb/tests/fq/generic/docker-compose.yml @@ -15,9 +15,14 @@ services: echo \"$$(dig tests-fq-generic-ydb +short) tests-fq-generic-ydb\" >> /etc/hosts; cat /etc/hosts; /opt/ydb/bin/fq-connector-go server -c /opt/ydb/cfg/fq-connector-go.yaml" container_name: tests-fq-generic-fq-connector-go - image: 
ghcr.io/ydb-platform/fq-connector-go:v0.5.0@sha256:6d3cec43478bef88dda195cd38c10e4df719c8ce6d13c9bd288c7ec40410e9d8 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 ports: - "2130" + greenplum: + container_name: tests-fq-generic-greenplum + image: ghcr.io/ydb-platform/fq-connector-go_greenplum:6.25.3-6432@sha256:9e862b05719b289b447562fbce6c003916a764a549f924a4175eecd7e7891a0b + volumes: + - ./greenplum/init_db.sh:/init_db.sh postgresql: command: -p 6432 container_name: tests-fq-generic-postgresql diff --git a/ydb/tests/fq/generic/greenplum/Dockerfile b/ydb/tests/fq/generic/greenplum/Dockerfile new file mode 100644 index 000000000000..7818e2e8cd19 --- /dev/null +++ b/ydb/tests/fq/generic/greenplum/Dockerfile @@ -0,0 +1,4 @@ +FROM ghcr.io/ydb-platform/fq-connector-go_greenplum:6.25.3@sha256:0627a657b179ff73949fec05201f3e164b92639281eff248cd07669ce7247eec + +# For the sake of simplicity of tests, we force Greenplum to use the same port that it uses within MDB +RUN find /data -type f -exec sed -i 's/5432/6432/' "{}" +; diff --git a/ydb/tests/fq/generic/greenplum/README.md b/ydb/tests/fq/generic/greenplum/README.md new file mode 100644 index 000000000000..e94189b1b16c --- /dev/null +++ b/ydb/tests/fq/generic/greenplum/README.md @@ -0,0 +1 @@ +Docker image built from this Dockerfile is pushed as `ghcr.io/ydb-platform/fq-connector-go_greenplum:6.25.3-6432@sha256:9e862b05719b289b447562fbce6c003916a764a549f924a4175eecd7e7891a0b`. No need to rebuild them every time. 
diff --git a/ydb/tests/fq/generic/greenplum/init_db.sh b/ydb/tests/fq/generic/greenplum/init_db.sh new file mode 100644 index 000000000000..f1a2cc7dbe4b --- /dev/null +++ b/ydb/tests/fq/generic/greenplum/init_db.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +psql -p 6432 -v ON_ERROR_STOP=1 --username gpadmin --dbname template1 <<-EOSQL + CREATE TABLE simple_table (number INT); + INSERT INTO simple_table VALUES ((3)), ((14)), ((15)); + + CREATE TABLE join_table (id INT, data bytea); + INSERT INTO join_table VALUES (1, 'gp10'), (2, 'gp20'), (3, 'gp30'); +EOSQL diff --git a/ydb/tests/fq/generic/test_greenplum.py b/ydb/tests/fq/generic/test_greenplum.py new file mode 100644 index 000000000000..ff4424a1f821 --- /dev/null +++ b/ydb/tests/fq/generic/test_greenplum.py @@ -0,0 +1,47 @@ +import logging +import pytest + +import ydb.public.api.protos.draft.fq_pb2 as fq +import ydb.public.api.protos.ydb_value_pb2 as ydb +from ydb.tests.tools.fq_runner.kikimr_utils import yq_v2 + +from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient +from ydb.tests.fq.generic.utils.settings import Settings + + +class TestGreenplum: + @yq_v2 + @pytest.mark.parametrize("fq_client", [{"folder_id": "my_folder"}], indirect=True) + def test_simple(self, fq_client: FederatedQueryClient, settings: Settings): + table_name = "simple_table" + conn_name = f"conn_{table_name}" + query_name = f"query_{table_name}" + + fq_client.create_greenplum_connection( + name=conn_name, + database_name=settings.greenplum.dbname, + database_id="greenplum_cluster_id", + login=settings.greenplum.username, + password=settings.greenplum.password, + ) + + sql = Rf""" + SELECT * + FROM {conn_name}.{table_name} ORDER BY number; + """ + + query_id = fq_client.create_query(query_name, sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + fq_client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + + data = fq_client.get_result_data(query_id) + result_set = data.result.result_set + 
logging.debug(str(result_set)) + assert len(result_set.columns) == 1 + assert result_set.columns[0].name == "number" + assert result_set.columns[0].type == ydb.Type( + optional_type=ydb.OptionalType(item=ydb.Type(type_id=ydb.Type.INT32)) + ) + assert len(result_set.rows) == 3 + assert result_set.rows[0].items[0].int32_value == 3 + assert result_set.rows[1].items[0].int32_value == 14 + assert result_set.rows[2].items[0].int32_value == 15 diff --git a/ydb/tests/fq/generic/test_join.py b/ydb/tests/fq/generic/test_join.py index 5100d83b7205..fff9b00042c1 100644 --- a/ydb/tests/fq/generic/test_join.py +++ b/ydb/tests/fq/generic/test_join.py @@ -41,6 +41,7 @@ def test_simple(self, fq_client: FederatedQueryClient, settings: Settings, query database_id=settings.ydb.dbname, ) + # FIXME: research why test starts failing if we add Greenplum sql = fR''' SELECT pg.data AS data_pg, ch.data AS data_ch, ydb.data AS data_ydb FROM {pg_conn_name}.{table_name} AS pg diff --git a/ydb/tests/fq/generic/utils/endpoint_determiner.py b/ydb/tests/fq/generic/utils/endpoint_determiner.py index a8c146443c54..7b822010ac25 100644 --- a/ydb/tests/fq/generic/utils/endpoint_determiner.py +++ b/ydb/tests/fq/generic/utils/endpoint_determiner.py @@ -4,20 +4,26 @@ import yatest.common -# TODO: avoid duplication with ydb/library/yql/providers/generic/connector/tests/utils/docker_compose.py class EndpointDeterminer: docker_compose_bin: os.PathLike docker_compose_yml: os.PathLike def __init__(self, docker_compose_yml: os.PathLike): - self.docker_compose_bin = yatest.common.build_path('library/recipes/docker_compose/bin/docker-compose') + self.docker_compose_bin = yatest.common.build_path("library/recipes/docker_compose/bin/docker-compose") self.docker_compose_yml = docker_compose_yml def get_port(self, service_name: str, internal_port: int) -> int: - cmd = [self.docker_compose_bin, '-f', self.docker_compose_yml, 'port', service_name, str(internal_port)] + cmd = [ + self.docker_compose_bin, + "-f", + 
self.docker_compose_yml, + "port", + service_name, + str(internal_port), + ] try: out = subprocess.check_output(cmd, stderr=subprocess.STDOUT) - external_port = int(out.split(b':')[1]) + external_port = int(out.split(b":")[1]) return external_port except subprocess.CalledProcessError as e: raise RuntimeError(f"docker-compose error: {e.output} (code {e.returncode})") diff --git a/ydb/tests/fq/generic/utils/settings.py b/ydb/tests/fq/generic/utils/settings.py index e231b219bfb8..a2efdc2a2480 100644 --- a/ydb/tests/fq/generic/utils/settings.py +++ b/ydb/tests/fq/generic/utils/settings.py @@ -9,6 +9,8 @@ @dataclass class Settings: + # infrastructure services + @dataclass class Connector: grpc_host: str @@ -29,6 +31,8 @@ class TokenAccessorMock: token_accessor_mock: TokenAccessorMock + # databases + @dataclass class ClickHouse: dbname: str @@ -38,6 +42,14 @@ class ClickHouse: clickhouse: ClickHouse + @dataclass + class Greenplum: + dbname: str + username: str + password: str + + greenplum: Greenplum + @dataclass class PostgreSQL: dbname: str @@ -77,6 +89,11 @@ def from_env(cls) -> 'Settings': password='password', protocol='native', ), + greenplum=cls.Greenplum( + dbname='template1', + username='gpadmin', + password='123456', + ), postgresql=cls.PostgreSQL( dbname='db', username='user', diff --git a/ydb/tests/fq/generic/ya.make b/ydb/tests/fq/generic/ya.make index c6dcdaadd034..06b3bdfc9644 100644 --- a/ydb/tests/fq/generic/ya.make +++ b/ydb/tests/fq/generic/ya.make @@ -64,6 +64,7 @@ PEERDIR( TEST_SRCS( conftest.py test_clickhouse.py + test_greenplum.py test_join.py test_postgresql.py test_streaming_join.py diff --git a/ydb/tests/tools/fq_runner/fq_client.py b/ydb/tests/tools/fq_runner/fq_client.py index b34029d09eca..001b7cf3788d 100644 --- a/ydb/tests/tools/fq_runner/fq_client.py +++ b/ydb/tests/tools/fq_runner/fq_client.py @@ -421,22 +421,6 @@ def create_yds_connection(self, name, database=None, endpoint=None, database_id= request.content.acl.visibility = visibility 
return self.create_connection(request, check_issues) - @retry.retry_intrusive - def create_postgresql_connection(self, name, database_name, database_id, login, password, - secure=False, visibility=fq.Acl.Visibility.PRIVATE, auth_method=AuthMethod.service_account('sa'), check_issues=True): - request = fq.CreateConnectionRequest() - request.content.name = name - pg = request.content.setting.postgresql_cluster - pg.database_name = database_name - pg.database_id = database_id - pg.secure = secure - pg.login = login - pg.password = password - - pg.auth.CopyFrom(auth_method) - request.content.acl.visibility = visibility - return self.create_connection(request, check_issues) - @retry.retry_intrusive def create_clickhouse_connection(self, name, database_name, database_id, login, password, secure=False, visibility=fq.Acl.Visibility.PRIVATE, auth_method=AuthMethod.service_account('sa'), check_issues=True): @@ -453,6 +437,37 @@ def create_clickhouse_connection(self, name, database_name, database_id, login, request.content.acl.visibility = visibility return self.create_connection(request, check_issues) + @retry.retry_intrusive + def create_greenplum_connection(self, name, database_name, database_id, login, password, + secure=False, visibility=fq.Acl.Visibility.PRIVATE, auth_method=AuthMethod.service_account('sa'), check_issues=True): + request = fq.CreateConnectionRequest() + request.content.name = name + gp = request.content.setting.greenplum_cluster + gp.database_name = database_name + gp.database_id = database_id + gp.login = login + gp.password = password + + gp.auth.CopyFrom(auth_method) + request.content.acl.visibility = visibility + return self.create_connection(request, check_issues) + + @retry.retry_intrusive + def create_postgresql_connection(self, name, database_name, database_id, login, password, + secure=False, visibility=fq.Acl.Visibility.PRIVATE, auth_method=AuthMethod.service_account('sa'), check_issues=True): + request = fq.CreateConnectionRequest() + 
request.content.name = name + pg = request.content.setting.postgresql_cluster + pg.database_name = database_name + pg.database_id = database_id + pg.secure = secure + pg.login = login + pg.password = password + + pg.auth.CopyFrom(auth_method) + request.content.acl.visibility = visibility + return self.create_connection(request, check_issues) + @retry.retry_intrusive def list_connections(self, visibility, name_substring=None, limit=100, check_issues=True, page_token=""): request = fq.ListConnectionsRequest() diff --git a/ydb/tests/tools/fq_runner/kikimr_utils.py b/ydb/tests/tools/fq_runner/kikimr_utils.py index 02110b16351a..b3f86b84a3d8 100644 --- a/ydb/tests/tools/fq_runner/kikimr_utils.py +++ b/ydb/tests/tools/fq_runner/kikimr_utils.py @@ -271,8 +271,9 @@ def is_applicable(self, request): def apply_to_kikimr(self, request, kikimr): kikimr.control_plane.fq_config['common']['disable_ssl_for_generic_data_sources'] = True - kikimr.control_plane.fq_config['control_plane_storage']['available_connection'].append('POSTGRESQL_CLUSTER') kikimr.control_plane.fq_config['control_plane_storage']['available_connection'].append('CLICKHOUSE_CLUSTER') + kikimr.control_plane.fq_config['control_plane_storage']['available_connection'].append('GREENPLUM_CLUSTER') + kikimr.control_plane.fq_config['control_plane_storage']['available_connection'].append('POSTGRESQL_CLUSTER') kikimr.control_plane.fq_config['control_plane_storage']['available_connection'].append('YDB_DATABASE') generic = { diff --git a/ydb/tests/tools/mdb_mock/__main__.py b/ydb/tests/tools/mdb_mock/__main__.py index 7201e629482c..5e73ad2e145a 100644 --- a/ydb/tests/tools/mdb_mock/__main__.py +++ b/ydb/tests/tools/mdb_mock/__main__.py @@ -53,10 +53,31 @@ async def postgresql_handler(request): return web.Response(body=json.dumps({})) +async def greenplum_handler(request): + cluster_id = request.match_info['cluster_id'] + + if cluster_id == 'greenplum_cluster_id': + return web.Response( + body=json.dumps( + { + "hosts": [ + { 
+ "name": "greenplum", + "type": "MASTER", + "health": "ALIVE", + } + ] + } + ) + ) + return web.Response(body=json.dumps({})) + + def serve(port: int): app = web.Application() app.add_routes([web.get('/managed-clickhouse/v1/clusters/{cluster_id}/hosts', clickhouse_handler)]) app.add_routes([web.get('/managed-postgresql/v1/clusters/{cluster_id}/hosts', postgresql_handler)]) + app.add_routes([web.get('/managed-greenplum/v1/clusters/{cluster_id}/master-hosts', greenplum_handler)]) web.run_app(app, port=port) From 12d1cd421c6c5a5ca80cd2c87b817383313deae5 Mon Sep 17 00:00:00 2001 From: yumkam Date: Wed, 4 Sep 2024 16:07:17 +0300 Subject: [PATCH 22/56] Fix streamlookupjoin (backport #8422) (#8540) --- .../dq_input_transform_lookup.cpp | 2 +- ydb/tests/fq/generic/test_streaming_join.py | 328 ++++++++++++++++++ ydb/tests/fq/generic/ydb/01_basic.sh | 17 + ydb/tests/tools/fq_runner/kikimr_runner.py | 1 + 4 files changed, 347 insertions(+), 1 deletion(-) diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 646f49130507..9f2f52f17bb8 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -111,7 +111,7 @@ class TInputTransformStreamLookupBase outputRowItems[i] = wideInputRow[index]; break; case EOutputRowItemSource::LookupKey: - outputRowItems[i] = lookupKey.GetElement(index); + outputRowItems[i] = lookupPayload && *lookupPayload ? 
lookupKey.GetElement(index) : NUdf::TUnboxedValue {}; break; case EOutputRowItemSource::LookupOther: if (lookupPayload && *lookupPayload) { diff --git a/ydb/tests/fq/generic/test_streaming_join.py b/ydb/tests/fq/generic/test_streaming_join.py index cf9250def41f..dc5e54d85a46 100644 --- a/ydb/tests/fq/generic/test_streaming_join.py +++ b/ydb/tests/fq/generic/test_streaming_join.py @@ -1,5 +1,7 @@ import pytest import os +import json +import sys import ydb.public.api.protos.draft.fq_pb2 as fq from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 @@ -8,6 +10,275 @@ from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase from ydb.tests.fq.generic.utils.settings import Settings +DEBUG = 0 +TESTCASES = [ + # 0 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}`; + + $enriched = select + e.Data as data, u.id as lookup + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.{table_name} as u + on(e.Data = u.data) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ('ydb10', '{"data":"ydb10","lookup":1}'), + ('ydb20', '{"data":"ydb20","lookup":2}'), + ('ydb30', '{"data":"ydb30","lookup":3}'), + ('ydb40', '{"data":"ydb40","lookup":null}'), + ('ydb50', '{"data":"ydb50","lookup":null}'), + ('ydb10', '{"data":"ydb10","lookup":1}'), + ('ydb20', '{"data":"ydb20","lookup":2}'), + ('ydb30', '{"data":"ydb30","lookup":3}'), + ('ydb40', '{"data":"ydb40","lookup":null}'), + ('ydb50', '{"data":"ydb50","lookup":null}'), + ] + * 10, + ), + # 1 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}`; + + $enriched = select + e.Data as data, CAST(e.Data AS Int32) as id, u.data as lookup + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.{table_name} as u + on(CAST(e.Data AS Int32) = u.id) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ('1', 
'{"data":"1","id":1,"lookup":"ydb10"}'), + ('2', '{"data":"2","id":2,"lookup":"ydb20"}'), + ('3', '{"data":"3","id":3,"lookup":"ydb30"}'), + ('4', '{"data":"4","id":4,"lookup":null}'), + ('5', '{"data":"5","id":5,"lookup":null}'), + ('1', '{"data":"1","id":1,"lookup":"ydb10"}'), + ('2', '{"data":"2","id":2,"lookup":"ydb20"}'), + ('3', '{"data":"3","id":3,"lookup":"ydb30"}'), + ('4', '{"data":"4","id":4,"lookup":null}'), + ('5', '{"data":"5","id":5,"lookup":null}'), + ] + * 3, + ), + # 2 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + id Int32, + user Int32, + ) + ) ; + + $enriched = select e.id as id, + e.user as user_id, + u.data as lookup + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.{table_name} as u + on(e.user = u.id) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ('{"id":3,"user":5}', '{"id":3,"user_id":5,"lookup":null}'), + ('{"id":9,"user":3}', '{"id":9,"user_id":3,"lookup":"ydb30"}'), + ('{"id":2,"user":2}', '{"id":2,"user_id":2,"lookup":"ydb20"}'), + ('{"id":1,"user":1}', '{"id":1,"user_id":1,"lookup":"ydb10"}'), + ('{"id":4,"user":3}', '{"id":4,"user_id":3,"lookup":"ydb30"}'), + ('{"id":5,"user":3}', '{"id":5,"user_id":3,"lookup":"ydb30"}'), + ('{"id":6,"user":1}', '{"id":6,"user_id":1,"lookup":"ydb10"}'), + ('{"id":7,"user":2}', '{"id":7,"user_id":2,"lookup":"ydb20"}'), + ] + * 20, + ), + # 3 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + id Int32, + ts String, + ev_type String, + user Int32, + ) + ) ; + + $formatTime = DateTime::Format("%H:%M:%S"); + + $enriched = select e.id as id, + $formatTime(DateTime::ParseIso8601(e.ts)) as ts, + e.user as user_id, + u.data as lookup + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.{table_name} as u + on(e.user = u.id) + ; + + insert into myyds.`{output_topic}` + select 
Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ( + '{"id":2,"ts":"20240701T113344","ev_type":"foo1","user":2}', + '{"id":2,"ts":"11:33:44","user_id":2,"lookup":"ydb20"}', + ), + ( + '{"id":1,"ts":"20240701T112233","ev_type":"foo2","user":1}', + '{"id":1,"ts":"11:22:33","user_id":1,"lookup":"ydb10"}', + ), + ( + '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":5}', + '{"id":3,"ts":"11:33:55","user_id":5,"lookup":null}', + ), + ( + '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', + '{"id":4,"ts":"11:33:56","user_id":3,"lookup":"ydb30"}', + ), + ( + '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', + '{"id":5,"ts":"11:33:57","user_id":3,"lookup":"ydb30"}', + ), + ( + '{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', + '{"id":6,"ts":"11:22:38","user_id":1,"lookup":"ydb10"}', + ), + ( + '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', + '{"id":7,"ts":"11:33:49","user_id":2,"lookup":"ydb20"}', + ), + ] + * 10, + ), + # 4 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + id Int32, + ts String, + ev_type String, + user Int32, + ) + ) ; + + $formatTime = DateTime::Format("%H:%M:%S"); + + $enriched = select e.id as id, + $formatTime(DateTime::ParseIso8601(e.ts)) as ts, + e.user as user_id, + u.id as uid, + u.name as name, + u.age as age + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.`users` as u + on(e.user = u.id) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ( + '{"id":1,"ts":"20240701T113344","ev_type":"foo1","user":2}', + '{"id":1,"ts":"11:33:44","uid":2,"user_id":2,"name":"Petr","age":25}', + ), + ( + '{"id":2,"ts":"20240701T112233","ev_type":"foo2","user":1}', + '{"id":2,"ts":"11:22:33","uid":1,"user_id":1,"name":"Anya","age":15}', + ), + ( + '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":100}', + 
'{"id":3,"ts":"11:33:55","uid":null,"user_id":100,"name":null,"age":null}', + ), + ( + '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', + '{"id":4,"ts":"11:33:56","uid":3,"user_id":3,"name":"Masha","age":17}', + ), + ( + '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', + '{"id":5,"ts":"11:33:57","uid":3,"user_id":3,"name":"Masha","age":17}', + ), + ( + '{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', + '{"id":6,"ts":"11:22:38","uid":1,"user_id":1,"name":"Anya","age":15}', + ), + ( + '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', + '{"id":7,"ts":"11:33:49","uid":2,"user_id":2,"name":"Petr","age":25}', + ), + ] + * 1000, + ), + # 5 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + id Int32, + ts String, + ev_type String, + user Int32, + ) + ) ; + + $enriched = select e.id as id, + e.user as user_id, + u.id as uid + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.`users` as u + on(e.user = u.id) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + [ + ( + '{"id":1,"ts":"20240701T113344","ev_type":"foo1","user":2}', + '{"id":1,"uid":2,"user_id":2}', + ), + ( + '{"id":2,"ts":"20240701T112233","ev_type":"foo2","user":1}', + '{"id":2,"uid":1,"user_id":1}', + ), + ( + '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":100}', + '{"id":3,"uid":null,"user_id":100}', + ), + ( + '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', + '{"id":4,"uid":3,"user_id":3}', + ), + ], + ), +] + class TestStreamingJoin(TestYdsBase): @yq_v1 @@ -59,3 +330,60 @@ def test_simple(self, kikimr, fq_client: FederatedQueryClient, settings: Setting status = describe_response.result.query.meta.status assert not describe_response.issues, str(describe_response.issues) assert status == fq.QueryMeta.ABORTED_BY_USER, fq.QueryMeta.ComputeStatus.Name(status) + + @yq_v1 + 
@pytest.mark.parametrize("mvp_external_ydb_endpoint", [{"endpoint": "tests-fq-generic-ydb:2136"}], indirect=True) + @pytest.mark.parametrize("fq_client", [{"folder_id": "my_folder_slj"}], indirect=True) + @pytest.mark.parametrize("streamlookup", [False, True]) + @pytest.mark.parametrize("testcase", [*range(len(TESTCASES))]) + def test_streamlookup( + self, kikimr, testcase, streamlookup, fq_client: FederatedQueryClient, settings: Settings, yq_version + ): + self.init_topics(f"pq_yq_streaming_test_lookup_{streamlookup}{testcase}_{yq_version}") + fq_client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) + + table_name = 'join_table' + ydb_conn_name = f'ydb_conn_{table_name}' + + fq_client.create_ydb_connection( + name=ydb_conn_name, + database_id=settings.ydb.dbname, + ) + + sql, messages = TESTCASES[testcase] + sql = sql.format( + input_topic=self.input_topic, + output_topic=self.output_topic, + table_name=table_name, + streamlookup=R'/*+ streamlookup() */' if streamlookup else '', + ) + + query_id = fq_client.create_query( + f"streamlookup_{streamlookup}{testcase}", sql, type=fq.QueryContent.QueryType.STREAMING + ).result.query_id + fq_client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + kikimr.compute_plane.wait_zero_checkpoint(query_id) + + offset = 0 + while offset < len(messages): + chunk = messages[offset : offset + 500] + self.write_stream(map(lambda x: x[0], chunk)) + offset += 500 + + read_data = self.read_stream(len(messages)) + if DEBUG: + print(streamlookup, testcase, file=sys.stderr) + print(sql, file=sys.stderr) + print(*zip(messages, read_data), file=sys.stderr, sep="\n") + for r, exp in zip(read_data, messages): + r = json.loads(r) + exp = json.loads(exp[1]) + assert r == exp + + fq_client.abort_query(query_id) + fq_client.wait_query(query_id) + + describe_response = fq_client.describe_query(query_id) + status = describe_response.result.query.meta.status + assert not describe_response.issues, 
str(describe_response.issues) + assert status == fq.QueryMeta.ABORTED_BY_USER, fq.QueryMeta.ComputeStatus.Name(status) diff --git a/ydb/tests/fq/generic/ydb/01_basic.sh b/ydb/tests/fq/generic/ydb/01_basic.sh index 2c94ade1c561..2a54e14f6b47 100755 --- a/ydb/tests/fq/generic/ydb/01_basic.sh +++ b/ydb/tests/fq/generic/ydb/01_basic.sh @@ -18,6 +18,23 @@ set -ex (2, "ydb20"), (3, "ydb30"); COMMIT; + CREATE TABLE users (age Int32, id Int32, ip STRING, name STRING, region Int32, PRIMARY KEY(id)); + COMMIT; + INSERT INTO users (age, id, ip, name, region) VALUES + (15, 1, "95.106.17.32", "Anya", 213), + (25, 2, "88.78.248.151", "Petr", 225), + (17, 3, "93.94.183.63", "Masha", 1), + (5, 4, "::ffff:193.34.173.188", "Alena", 225), + (15, 5, "93.170.111.29", "Irina", 2), + (13, 6, "93.170.111.28", "Inna", 21), + (33, 7, "::ffff:193.34.173.173", "Ivan", 125), + (45, 8, "::ffff:133.34.173.188", "Asya", 225), + (27, 9, "::ffff:133.34.172.188", "German", 125), + (41, 10, "::ffff:133.34.173.185", "Olya", 225), + (35, 11, "::ffff:193.34.163.188", "Slava", 2), + (56, 12, "2a02:1812:1713:4f00:517e:1d79:c88b:704", "Elena", 2), + (18, 17, "ivalid ip", "newUser", 12); + COMMIT; ' retVal=$? 
diff --git a/ydb/tests/tools/fq_runner/kikimr_runner.py b/ydb/tests/tools/fq_runner/kikimr_runner.py index 3161591f2c78..d0480e8dd533 100644 --- a/ydb/tests/tools/fq_runner/kikimr_runner.py +++ b/ydb/tests/tools/fq_runner/kikimr_runner.py @@ -146,6 +146,7 @@ def fill_gateways_cfg(self, gateways): gateways['yql_core'] = {} gateways['yql_core']['flags'] = [] gateways['yql_core']['flags'].append({'name': "_EnableMatchRecognize"}) + gateways['yql_core']['flags'].append({'name': "_EnableStreamLookupJoin"}) def fill_storage_config(self, storage, directory): storage['endpoint'] = os.getenv("YDB_ENDPOINT") From 0fb80c7d4d8f6fb8481a9ceac472b92334df3b6f Mon Sep 17 00:00:00 2001 From: yumkam Date: Wed, 4 Sep 2024 17:24:01 +0300 Subject: [PATCH 23/56] streamlookupjoin: fix joining from multi-partition stream (backport #8622) (#8737) --- .../yql/dq/tasks/dq_connection_builder.h | 2 +- ydb/tests/fq/generic/test_streaming_join.py | 205 +++++++++++------- 2 files changed, 125 insertions(+), 82 deletions(-) diff --git a/ydb/library/yql/dq/tasks/dq_connection_builder.h b/ydb/library/yql/dq/tasks/dq_connection_builder.h index 99448deb3660..0670eba446bd 100644 --- a/ydb/library/yql/dq/tasks/dq_connection_builder.h +++ b/ydb/library/yql/dq/tasks/dq_connection_builder.h @@ -216,7 +216,7 @@ void BuildStreamLookupChannels(TGraph& graph, const NNodes::TDqPhyStage& stage, auto& originStageInfo = graph.GetStageInfo(cnStreamLookup.Output().Stage()); auto outputIndex = FromString(cnStreamLookup.Output().Index().Value()); - BuildMapChannels(graph, stageInfo, inputIndex, originStageInfo, outputIndex, false /*spilling*/, logFunc); + BuildUnionAllChannels(graph, stageInfo, inputIndex, originStageInfo, outputIndex, false /*spilling*/, logFunc); } template diff --git a/ydb/tests/fq/generic/test_streaming_join.py b/ydb/tests/fq/generic/test_streaming_join.py index dc5e54d85a46..16269e2129cf 100644 --- a/ydb/tests/fq/generic/test_streaming_join.py +++ b/ydb/tests/fq/generic/test_streaming_join.py @@ 
-2,6 +2,8 @@ import os import json import sys +from collections import Counter +from operator import itemgetter import ydb.public.api.protos.draft.fq_pb2 as fq from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 @@ -11,6 +13,31 @@ from ydb.tests.fq.generic.utils.settings import Settings DEBUG = 0 + + +def ResequenceId(messages): + res = [] + i = 1 + for pair in messages: + rpair = [] + for it in pair: + src = json.loads(it) + src["id"] = i + rpair += [json.dumps(src)] + res += [tuple(rpair)] + i += 1 + return res + + +def freeze(json): + t = type(json) + if t == dict: + return frozenset((k, freeze(v)) for k, v in json.items()) + if t == list: + return tuple(map(freeze, json)) + return json + + TESTCASES = [ # 0 ( @@ -96,17 +123,19 @@ insert into myyds.`{output_topic}` select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; ''', - [ - ('{"id":3,"user":5}', '{"id":3,"user_id":5,"lookup":null}'), - ('{"id":9,"user":3}', '{"id":9,"user_id":3,"lookup":"ydb30"}'), - ('{"id":2,"user":2}', '{"id":2,"user_id":2,"lookup":"ydb20"}'), - ('{"id":1,"user":1}', '{"id":1,"user_id":1,"lookup":"ydb10"}'), - ('{"id":4,"user":3}', '{"id":4,"user_id":3,"lookup":"ydb30"}'), - ('{"id":5,"user":3}', '{"id":5,"user_id":3,"lookup":"ydb30"}'), - ('{"id":6,"user":1}', '{"id":6,"user_id":1,"lookup":"ydb10"}'), - ('{"id":7,"user":2}', '{"id":7,"user_id":2,"lookup":"ydb20"}'), - ] - * 20, + ResequenceId( + [ + ('{"id":3,"user":5}', '{"id":3,"user_id":5,"lookup":null}'), + ('{"id":9,"user":3}', '{"id":9,"user_id":3,"lookup":"ydb30"}'), + ('{"id":2,"user":2}', '{"id":2,"user_id":2,"lookup":"ydb20"}'), + ('{"id":1,"user":1}', '{"id":1,"user_id":1,"lookup":"ydb10"}'), + ('{"id":4,"user":3}', '{"id":4,"user_id":3,"lookup":"ydb30"}'), + ('{"id":5,"user":3}', '{"id":5,"user_id":3,"lookup":"ydb30"}'), + ('{"id":6,"user":1}', '{"id":6,"user_id":1,"lookup":"ydb10"}'), + ('{"id":7,"user":2}', '{"id":7,"user_id":2,"lookup":"ydb20"}'), + ] + * 20 + ), ), # 3 ( @@ -137,37 +166,39 @@ 
insert into myyds.`{output_topic}` select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; ''', - [ - ( - '{"id":2,"ts":"20240701T113344","ev_type":"foo1","user":2}', - '{"id":2,"ts":"11:33:44","user_id":2,"lookup":"ydb20"}', - ), - ( - '{"id":1,"ts":"20240701T112233","ev_type":"foo2","user":1}', - '{"id":1,"ts":"11:22:33","user_id":1,"lookup":"ydb10"}', - ), - ( - '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":5}', - '{"id":3,"ts":"11:33:55","user_id":5,"lookup":null}', - ), - ( - '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', - '{"id":4,"ts":"11:33:56","user_id":3,"lookup":"ydb30"}', - ), - ( - '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', - '{"id":5,"ts":"11:33:57","user_id":3,"lookup":"ydb30"}', - ), - ( - '{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', - '{"id":6,"ts":"11:22:38","user_id":1,"lookup":"ydb10"}', - ), - ( - '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', - '{"id":7,"ts":"11:33:49","user_id":2,"lookup":"ydb20"}', - ), - ] - * 10, + ResequenceId( + [ + ( + '{"id":2,"ts":"20240701T113344","ev_type":"foo1","user":2}', + '{"id":2,"ts":"11:33:44","user_id":2,"lookup":"ydb20"}', + ), + ( + '{"id":1,"ts":"20240701T112233","ev_type":"foo2","user":1}', + '{"id":1,"ts":"11:22:33","user_id":1,"lookup":"ydb10"}', + ), + ( + '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":5}', + '{"id":3,"ts":"11:33:55","user_id":5,"lookup":null}', + ), + ( + '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', + '{"id":4,"ts":"11:33:56","user_id":3,"lookup":"ydb30"}', + ), + ( + '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', + '{"id":5,"ts":"11:33:57","user_id":3,"lookup":"ydb30"}', + ), + ( + '{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', + '{"id":6,"ts":"11:22:38","user_id":1,"lookup":"ydb10"}', + ), + ( + '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', + '{"id":7,"ts":"11:33:49","user_id":2,"lookup":"ydb20"}', + ), + ] + * 10 + ), ), # 
4 ( @@ -200,37 +231,39 @@ insert into myyds.`{output_topic}` select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; ''', - [ - ( - '{"id":1,"ts":"20240701T113344","ev_type":"foo1","user":2}', - '{"id":1,"ts":"11:33:44","uid":2,"user_id":2,"name":"Petr","age":25}', - ), - ( - '{"id":2,"ts":"20240701T112233","ev_type":"foo2","user":1}', - '{"id":2,"ts":"11:22:33","uid":1,"user_id":1,"name":"Anya","age":15}', - ), - ( - '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":100}', - '{"id":3,"ts":"11:33:55","uid":null,"user_id":100,"name":null,"age":null}', - ), - ( - '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', - '{"id":4,"ts":"11:33:56","uid":3,"user_id":3,"name":"Masha","age":17}', - ), - ( - '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', - '{"id":5,"ts":"11:33:57","uid":3,"user_id":3,"name":"Masha","age":17}', - ), - ( - '{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', - '{"id":6,"ts":"11:22:38","uid":1,"user_id":1,"name":"Anya","age":15}', - ), - ( - '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', - '{"id":7,"ts":"11:33:49","uid":2,"user_id":2,"name":"Petr","age":25}', - ), - ] - * 1000, + ResequenceId( + [ + ( + '{"id":1,"ts":"20240701T113344","ev_type":"foo1","user":2}', + '{"id":1,"ts":"11:33:44","uid":2,"user_id":2,"name":"Petr","age":25}', + ), + ( + '{"id":2,"ts":"20240701T112233","ev_type":"foo2","user":1}', + '{"id":2,"ts":"11:22:33","uid":1,"user_id":1,"name":"Anya","age":15}', + ), + ( + '{"id":3,"ts":"20240701T113355","ev_type":"foo3","user":100}', + '{"id":3,"ts":"11:33:55","uid":null,"user_id":100,"name":null,"age":null}', + ), + ( + '{"id":4,"ts":"20240701T113356","ev_type":"foo4","user":3}', + '{"id":4,"ts":"11:33:56","uid":3,"user_id":3,"name":"Masha","age":17}', + ), + ( + '{"id":5,"ts":"20240701T113357","ev_type":"foo5","user":3}', + '{"id":5,"ts":"11:33:57","uid":3,"user_id":3,"name":"Masha","age":17}', + ), + ( + 
'{"id":6,"ts":"20240701T112238","ev_type":"foo6","user":1}', + '{"id":6,"ts":"11:22:38","uid":1,"user_id":1,"name":"Anya","age":15}', + ), + ( + '{"id":7,"ts":"20240701T113349","ev_type":"foo7","user":2}', + '{"id":7,"ts":"11:33:49","uid":2,"user_id":2,"name":"Petr","age":25}', + ), + ] + * 1000 + ), ), # 5 ( @@ -334,12 +367,23 @@ def test_simple(self, kikimr, fq_client: FederatedQueryClient, settings: Setting @yq_v1 @pytest.mark.parametrize("mvp_external_ydb_endpoint", [{"endpoint": "tests-fq-generic-ydb:2136"}], indirect=True) @pytest.mark.parametrize("fq_client", [{"folder_id": "my_folder_slj"}], indirect=True) - @pytest.mark.parametrize("streamlookup", [False, True]) + @pytest.mark.parametrize("partitions_count", [1, 3]) + @pytest.mark.parametrize("streamlookup", [False, True] if DEBUG else [True]) @pytest.mark.parametrize("testcase", [*range(len(TESTCASES))]) def test_streamlookup( - self, kikimr, testcase, streamlookup, fq_client: FederatedQueryClient, settings: Settings, yq_version + self, + kikimr, + testcase, + streamlookup, + partitions_count, + fq_client: FederatedQueryClient, + settings: Settings, + yq_version, ): - self.init_topics(f"pq_yq_streaming_test_lookup_{streamlookup}{testcase}_{yq_version}") + self.init_topics( + f"pq_yq_str_lookup_{partitions_count}{streamlookup}{testcase}_{yq_version}", + partitions_count=partitions_count, + ) fq_client.create_yds_connection("myyds", os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT")) table_name = 'join_table' @@ -359,7 +403,7 @@ def test_streamlookup( ) query_id = fq_client.create_query( - f"streamlookup_{streamlookup}{testcase}", sql, type=fq.QueryContent.QueryType.STREAMING + f"streamlookup_{partitions_count}{streamlookup}{testcase}", sql, type=fq.QueryContent.QueryType.STREAMING ).result.query_id fq_client.wait_query_status(query_id, fq.QueryMeta.RUNNING) kikimr.compute_plane.wait_zero_checkpoint(query_id) @@ -375,10 +419,9 @@ def test_streamlookup( print(streamlookup, testcase, file=sys.stderr) 
print(sql, file=sys.stderr) print(*zip(messages, read_data), file=sys.stderr, sep="\n") - for r, exp in zip(read_data, messages): - r = json.loads(r) - exp = json.loads(exp[1]) - assert r == exp + read_data_ctr = Counter(map(freeze, map(json.loads, read_data))) + messages_ctr = Counter(map(freeze, map(json.loads, map(itemgetter(1), messages)))) + assert read_data_ctr == messages_ctr fq_client.abort_query(query_id) fq_client.wait_query(query_id) From b3afe0630b3f14fd547a945efb7a8fc62638ab91 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Fri, 6 Sep 2024 14:15:47 +0300 Subject: [PATCH 24/56] single node scheduler has been added (#8445) (#8838) --- ydb/core/fq/libs/actors/nodes_manager.cpp | 200 ++++++++++++------ .../providers/dq/actors/executer_actor.cpp | 2 + .../yql/providers/dq/api/protos/dqs.proto | 1 + .../providers/dq/common/yql_dq_settings.cpp | 1 + .../yql/providers/dq/common/yql_dq_settings.h | 2 + ydb/tests/fq/yds/test_3_selects.py | 1 + 6 files changed, 139 insertions(+), 68 deletions(-) diff --git a/ydb/core/fq/libs/actors/nodes_manager.cpp b/ydb/core/fq/libs/actors/nodes_manager.cpp index b422fc77c470..83622fde3a80 100644 --- a/ydb/core/fq/libs/actors/nodes_manager.cpp +++ b/ydb/core/fq/libs/actors/nodes_manager.cpp @@ -16,6 +16,10 @@ #include #include +#include + +#include + #define LOG_E(stream) \ LOG_ERROR_S(*TlsActivationContext, NKikimrServices::YQL_NODES_MANAGER, stream) @@ -86,93 +90,148 @@ class TNodesManagerActor : public NActors::TActorBootstrappedGetCounter("EvAllocateWorkersRequest", true)->Inc(); - const auto &rec = ev->Get()->Record; - const auto count = rec.GetCount(); - - auto req = MakeHolder(); + const auto &request = ev->Get()->Record; + const auto count = request.GetCount(); + auto scheduler = request.GetScheduler(); + auto response = MakeHolder(); if (count == 0) { - auto& error = *req->Record.MutableError(); + auto& error = *response->Record.MutableError(); error.SetStatusCode(NYql::NDqProto::StatusIds::BAD_REQUEST); 
error.SetMessage("Incorrect request - 0 nodes requested"); + } else if (!scheduler) { + ScheduleUniformly(request, response); } else { - auto resourceId = rec.GetResourceId(); - if (!resourceId) { - resourceId = (ui64(++ResourceIdPart) << 32) | SelfId().NodeId(); + try { + auto schedulerSettings = NSc::TValue::FromJsonThrow(scheduler); + auto schedulerType = schedulerSettings["type"].GetString(); + if (schedulerType == "single_node") { + ScheduleOnSingleNode(request, response); + } else { + auto& error = *response->Record.MutableError(); + error.SetStatusCode(NYql::NDqProto::StatusIds::BAD_REQUEST); + error.SetMessage(TStringBuilder{} << "Unknown scheduler type: " << schedulerType << ", settings: " << scheduler); + } + } catch (...) { + auto& error = *response->Record.MutableError(); + error.SetStatusCode(NYql::NDqProto::StatusIds::BAD_REQUEST); + error.SetMessage(TStringBuilder{} << "Error choosing scheduler. Invalid settings: " << scheduler << ", error: " << CurrentExceptionMessage()); } + } + LOG_D("TEvAllocateWorkersResponse " << response->Record.DebugString()); - bool placementFailure = false; - ui64 memoryLimit = AtomicGet(WorkerManagerCounters.MkqlMemoryLimit->GetAtomic()); - ui64 memoryAllocated = AtomicGet(WorkerManagerCounters.MkqlMemoryAllocated->GetAtomic()); - TVector nodes; - for (ui32 i = 0; i < count; ++i) { - ui64 totalMemoryLimit = 0; - if (rec.TaskSize() > i) { - totalMemoryLimit = rec.GetTask(i).GetInitialTaskMemoryLimit(); - } - if (totalMemoryLimit == 0) { - totalMemoryLimit = MkqlInitialMemoryLimit; - } - TPeer node = {SelfId().NodeId(), InstanceId + "," + HostName(), 0, 0, 0, DataCenter}; - bool selfPlacement = true; - if (!Peers.empty()) { - auto FirstPeer = NextPeer; - while (true) { - Y_ABORT_UNLESS(NextPeer < Peers.size()); - auto& nextNode = Peers[NextPeer]; - - if (++NextPeer >= Peers.size()) { - NextPeer = 0; - } + Send(ev->Sender, response.Release()); + } - if ( (!UseDataCenter || DataCenter.empty() || nextNode.DataCenter.empty() || 
DataCenter == nextNode.DataCenter) // non empty DC must match - && ( nextNode.MemoryLimit == 0 // memory is NOT limited - || nextNode.MemoryLimit >= nextNode.MemoryAllocated + totalMemoryLimit) // or enough - ) { - // adjust allocated size to place next tasks correctly, will be reset after next health check - nextNode.MemoryAllocated += totalMemoryLimit; - if (nextNode.NodeId == SelfId().NodeId()) { - // eventually synced self allocation info - memoryAllocated += totalMemoryLimit; - } - node = nextNode; - selfPlacement = false; - break; - } + void ScheduleUniformly(const NYql::NDqProto::TAllocateWorkersRequest& request, THolder& response) { + const auto count = request.GetCount(); + auto resourceId = request.GetResourceId(); + if (!resourceId) { + resourceId = (ui64(++ResourceIdPart) << 32) | SelfId().NodeId(); + } + + bool placementFailure = false; + ui64 memoryLimit = AtomicGet(WorkerManagerCounters.MkqlMemoryLimit->GetAtomic()); + ui64 memoryAllocated = AtomicGet(WorkerManagerCounters.MkqlMemoryAllocated->GetAtomic()); + TVector nodes; + for (ui32 i = 0; i < count; ++i) { + ui64 totalMemoryLimit = 0; + if (request.TaskSize() > i) { + totalMemoryLimit = request.GetTask(i).GetInitialTaskMemoryLimit(); + } + if (totalMemoryLimit == 0) { + totalMemoryLimit = MkqlInitialMemoryLimit; + } + TPeer node = {SelfId().NodeId(), InstanceId + "," + HostName(), 0, 0, 0, DataCenter}; + bool selfPlacement = true; + if (!Peers.empty()) { + auto FirstPeer = NextPeer; + while (true) { + Y_ABORT_UNLESS(NextPeer < Peers.size()); + auto& nextNode = Peers[NextPeer]; + + if (++NextPeer >= Peers.size()) { + NextPeer = 0; + } - if (NextPeer == FirstPeer) { // we closed loop w/o success, fallback to self placement then - break; + if ( (!UseDataCenter || DataCenter.empty() || nextNode.DataCenter.empty() || DataCenter == nextNode.DataCenter) // non empty DC must match + && ( nextNode.MemoryLimit == 0 // memory is NOT limited + || nextNode.MemoryLimit >= nextNode.MemoryAllocated + 
totalMemoryLimit) // or enough + ) { + // adjust allocated size to place next tasks correctly, will be reset after next health check + nextNode.MemoryAllocated += totalMemoryLimit; + if (nextNode.NodeId == SelfId().NodeId()) { + // eventually synced self allocation info + memoryAllocated += totalMemoryLimit; } + node = nextNode; + selfPlacement = false; + break; } - } - if (selfPlacement) { - if (memoryLimit == 0 || memoryLimit >= memoryAllocated + totalMemoryLimit) { - memoryAllocated += totalMemoryLimit; - } else { - placementFailure = true; - auto& error = *req->Record.MutableError(); - error.SetStatusCode(NYql::NDqProto::StatusIds::CLUSTER_OVERLOADED); - error.SetMessage("Not enough free memory in the cluster"); + + if (NextPeer == FirstPeer) { // we closed loop w/o success, fallback to self placement then break; } } - nodes.push_back(node); } - - if (!placementFailure) { - req->Record.ClearError(); - auto& group = *req->Record.MutableNodes(); - group.SetResourceId(resourceId); - for (const auto& node : nodes) { - auto* worker = group.AddWorker(); - *worker->MutableGuid() = node.InstanceId; - worker->SetNodeId(node.NodeId); + if (selfPlacement) { + if (memoryLimit == 0 || memoryLimit >= memoryAllocated + totalMemoryLimit) { + memoryAllocated += totalMemoryLimit; + } else { + placementFailure = true; + auto& error = *response->Record.MutableError(); + error.SetStatusCode(NYql::NDqProto::StatusIds::CLUSTER_OVERLOADED); + error.SetMessage("Not enough free memory in the cluster"); + break; } } + nodes.push_back(node); } - LOG_D("TEvAllocateWorkersResponse " << req->Record.DebugString()); - Send(ev->Sender, req.Release()); + if (!placementFailure) { + response->Record.ClearError(); + auto& group = *response->Record.MutableNodes(); + group.SetResourceId(resourceId); + for (const auto& node : nodes) { + auto* worker = group.AddWorker(); + *worker->MutableGuid() = node.InstanceId; + worker->SetNodeId(node.NodeId); + } + } + } + + void ScheduleOnSingleNode(const 
NYql::NDqProto::TAllocateWorkersRequest& request, THolder& response) { + const auto count = request.GetCount(); + auto resourceId = request.GetResourceId(); + if (!resourceId) { + resourceId = (ui64(++ResourceIdPart) << 32) | SelfId().NodeId(); + } + + if (Peers.size() != SingleNodeScheduler.NodeOrder.size()) { + SingleNodeScheduler.NodeOrder.clear(); + for (ui32 i = 0; i < Peers.size(); i++) { + SingleNodeScheduler.NodeOrder.push_back(i); + } + std::shuffle(SingleNodeScheduler.NodeOrder.begin(), SingleNodeScheduler.NodeOrder.end(), std::default_random_engine(TInstant::Now().MicroSeconds())); + } + + TVector nodes; + for (ui32 i = 0; i < count; ++i) { + Y_ABORT_UNLESS(NextPeer < Peers.size()); + nodes.push_back(Peers[SingleNodeScheduler.NodeOrder[NextPeer]]); + } + if (++NextPeer >= Peers.size()) { + NextPeer = 0; + } + + response->Record.ClearError(); + auto& group = *response->Record.MutableNodes(); + group.SetResourceId(resourceId); + for (const auto& node : nodes) { + auto* worker = group.AddWorker(); + *worker->MutableGuid() = node.InstanceId; + worker->SetNodeId(node.NodeId); + } } void Handle(NDqs::TEvFreeWorkersNotify::TPtr&) { @@ -338,6 +397,11 @@ class TNodesManagerActor : public NActors::TActorBootstrapped NodeOrder; + }; + TSingleNodeScheduler SingleNodeScheduler; }; TActorId MakeNodesManagerId() { diff --git a/ydb/library/yql/providers/dq/actors/executer_actor.cpp b/ydb/library/yql/providers/dq/actors/executer_actor.cpp index c57bde386d26..b2b852792e9a 100644 --- a/ydb/library/yql/providers/dq/actors/executer_actor.cpp +++ b/ydb/library/yql/providers/dq/actors/executer_actor.cpp @@ -191,6 +191,7 @@ class TDqExecuter: public TRichActor, NYql::TCounters { const TString computeActorType = Settings->ComputeActorType.Get().GetOrElse("sync"); + const TString scheduler = Settings->Scheduler.Get().GetOrElse({}); auto resourceAllocator = RegisterChild(CreateResourceAllocator( GwmActorId, SelfId(), ControlId, workerCount, @@ -204,6 +205,7 @@ class TDqExecuter: 
public TRichActor, NYql::TCounters { allocateRequest->Record.SetCreateComputeActor(enableComputeActor); allocateRequest->Record.SetComputeActorType(computeActorType); allocateRequest->Record.SetStatsMode(StatsMode); + allocateRequest->Record.SetScheduler(scheduler); if (enableComputeActor) { ActorIdToProto(ControlId, allocateRequest->Record.MutableResultActorId()); } diff --git a/ydb/library/yql/providers/dq/api/protos/dqs.proto b/ydb/library/yql/providers/dq/api/protos/dqs.proto index 37a7449d305b..8d5b042b9fd0 100644 --- a/ydb/library/yql/providers/dq/api/protos/dqs.proto +++ b/ydb/library/yql/providers/dq/api/protos/dqs.proto @@ -40,6 +40,7 @@ message TAllocateWorkersRequest { uint64 FreeWorkerAfterMs = 14; NYql.NDqProto.EDqStatsMode StatsMode = 16; reserved 17; + string Scheduler = 18; } message TWorkerGroup { diff --git a/ydb/library/yql/providers/dq/common/yql_dq_settings.cpp b/ydb/library/yql/providers/dq/common/yql_dq_settings.cpp index 457474767d03..db574162ea8d 100644 --- a/ydb/library/yql/providers/dq/common/yql_dq_settings.cpp +++ b/ydb/library/yql/providers/dq/common/yql_dq_settings.cpp @@ -98,6 +98,7 @@ TDqConfiguration::TDqConfiguration() { REGISTER_SETTING(*this, _MaxAttachmentsSize); REGISTER_SETTING(*this, DisableCheckpoints); + REGISTER_SETTING(*this, Scheduler); } } // namespace NYql diff --git a/ydb/library/yql/providers/dq/common/yql_dq_settings.h b/ydb/library/yql/providers/dq/common/yql_dq_settings.h index 5823f213e526..8f5d97bc1d8c 100644 --- a/ydb/library/yql/providers/dq/common/yql_dq_settings.h +++ b/ydb/library/yql/providers/dq/common/yql_dq_settings.h @@ -133,6 +133,7 @@ struct TDqSettings { NCommon::TConfSetting _MaxAttachmentsSize; NCommon::TConfSetting DisableCheckpoints; + NCommon::TConfSetting Scheduler; // This options will be passed to executor_actor and worker_actor template @@ -186,6 +187,7 @@ struct TDqSettings { SAVE_SETTING(TaskRunnerStats); SAVE_SETTING(SpillingEngine); SAVE_SETTING(DisableCheckpoints); + 
SAVE_SETTING(Scheduler); #undef SAVE_SETTING } diff --git a/ydb/tests/fq/yds/test_3_selects.py b/ydb/tests/fq/yds/test_3_selects.py index a969becbab53..c733e8f51823 100644 --- a/ydb/tests/fq/yds/test_3_selects.py +++ b/ydb/tests/fq/yds/test_3_selects.py @@ -15,6 +15,7 @@ class TestSelects(object): @pytest.mark.parametrize("mvp_external_ydb_endpoint", [{"endpoint": os.getenv("YDB_ENDPOINT")}], indirect=True) def test_3_selects(self, client): sql = R''' + pragma dq.Scheduler=@@{"type": "single_node"}@@; SELECT 1 AS SingleColumn; SELECT "A" AS TextColumn; SELECT 11 AS Column1, 22 AS Column2; From 2eda3f8db0dd527791c7d7abaff173c98471b2c0 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:36:59 +0300 Subject: [PATCH 25/56] Fix erroneous finish on TDqInputMergeBlockStreamValue (#8926) Co-authored-by: Andrey Neporada --- ydb/library/yql/dq/runtime/dq_input_producer.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ydb/library/yql/dq/runtime/dq_input_producer.cpp b/ydb/library/yql/dq/runtime/dq_input_producer.cpp index 05c80f11921b..c35e3c45c397 100644 --- a/ydb/library/yql/dq/runtime/dq_input_producer.cpp +++ b/ydb/library/yql/dq/runtime/dq_input_producer.cpp @@ -433,6 +433,10 @@ class TDqInputMergeBlockStreamValue : public TComputationValue= BlockLen_; } + bool IsFinished() const { + return IsFinished_; + } + void NextRow() { Y_DEBUG_ABORT_UNLESS(!IsEmpty()); ++CurrBlockIndex_; @@ -645,7 +649,7 @@ class TDqInputMergeBlockStreamValue : public TComputationValue Date: Thu, 12 Sep 2024 13:40:06 +0200 Subject: [PATCH 26/56] Text of the DescribePath error has been improved (#8868) (#8997) --- .../yql/providers/pq/gateway/native/yql_pq_session.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/library/yql/providers/pq/gateway/native/yql_pq_session.cpp b/ydb/library/yql/providers/pq/gateway/native/yql_pq_session.cpp index f1b8ff15d626..09c50dedb565 100644 
--- a/ydb/library/yql/providers/pq/gateway/native/yql_pq_session.cpp +++ b/ydb/library/yql/providers/pq/gateway/native/yql_pq_session.cpp @@ -85,10 +85,10 @@ NPq::NConfigurationManager::TAsyncDescribePathResult TPqSession::DescribePath(co return client->DescribePath(path); } - return GetYdbPqClient(cluster, database, *config, credentialsProviderFactory).DescribeTopic(path).Apply([cluster, path](const NYdb::NTopic::TAsyncDescribeTopicResult& describeTopicResultFuture) { + return GetYdbPqClient(cluster, database, *config, credentialsProviderFactory).DescribeTopic(path).Apply([cluster, path, database](const NYdb::NTopic::TAsyncDescribeTopicResult& describeTopicResultFuture) { const NYdb::NTopic::TDescribeTopicResult& describeTopicResult = describeTopicResultFuture.GetValue(); if (!describeTopicResult.IsSuccess()) { - throw yexception() << "Failed to describe topic `" << cluster << "`.`" << path << "`: " << describeTopicResult.GetIssues().ToString(); + throw yexception() << "Failed to describe topic `" << cluster << "`.`" << path << "` in the database `" << database << "`: " << describeTopicResult.GetIssues().ToString(); } NPq::NConfigurationManager::TTopicDescription desc(path); desc.PartitionsCount = describeTopicResult.GetTopicDescription().GetTotalPartitionsCount(); From 22ef31dc7b203b1ad61f03d796d10dac876ef4fd Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Fri, 13 Sep 2024 09:17:35 +0200 Subject: [PATCH 27/56] AsyncDecoding and HttpGateway have been fixed (#9118) (#9141) --- .../common/http_gateway/yql_http_gateway.cpp | 25 +++++++++++-------- .../providers/s3/actors/yql_s3_read_actor.cpp | 13 +++++----- .../yql/providers/s3/common/source_context.h | 17 +------------ 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/ydb/library/yql/providers/common/http_gateway/yql_http_gateway.cpp b/ydb/library/yql/providers/common/http_gateway/yql_http_gateway.cpp index 9d793cd403da..a10a3f62fa7d 100644 --- 
a/ydb/library/yql/providers/common/http_gateway/yql_http_gateway.cpp +++ b/ydb/library/yql/providers/common/http_gateway/yql_http_gateway.cpp @@ -746,7 +746,7 @@ friend class IHTTPGateway; } size_t FillHandlers() { - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); for (auto it = Streams.cbegin(); Streams.cend() != it;) { if (const auto& stream = it->lock()) { const auto streamHandle = stream->GetHandle(); @@ -795,7 +795,7 @@ friend class IHTTPGateway; TEasyCurl::TPtr easy; long httpResponseCode = 0L; { - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); if (const auto it = Allocated.find(handle); Allocated.cend() != it) { easy = std::move(it->second); TString codeLabel; @@ -847,7 +847,7 @@ friend class IHTTPGateway; void Fail(CURLMcode result) { std::stack works; { - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); for (auto& item : Allocated) { works.emplace(std::move(item.second)); @@ -868,7 +868,7 @@ friend class IHTTPGateway; void Upload(TString url, THeaders headers, TString body, TOnResult callback, bool put, TRetryPolicy::TPtr retryPolicy) final { Rps->Inc(); - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); auto easy = TEasyCurlBuffer::Make(InFlight, DownloadedBytes, UploadedBytes, std::move(url), put ? TEasyCurl::EMethod::PUT : TEasyCurl::EMethod::POST, std::move(body), std::move(headers), 0U, 0U, std::move(callback), retryPolicy ? 
retryPolicy->CreateRetryState() : nullptr, InitConfig, DnsGateway.GetDNSCurlList()); Await.emplace(std::move(easy)); Wakeup(0U); @@ -877,7 +877,7 @@ friend class IHTTPGateway; void Delete(TString url, THeaders headers, TOnResult callback, TRetryPolicy::TPtr retryPolicy) final { Rps->Inc(); - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); auto easy = TEasyCurlBuffer::Make(InFlight, DownloadedBytes, UploadedBytes, std::move(url), TEasyCurl::EMethod::DELETE, 0, std::move(headers), 0U, 0U, std::move(callback), retryPolicy ? retryPolicy->CreateRetryState() : nullptr, InitConfig, DnsGateway.GetDNSCurlList()); Await.emplace(std::move(easy)); Wakeup(0U); @@ -898,7 +898,7 @@ friend class IHTTPGateway; callback(TResult(CURLE_OK, TIssues{error})); return; } - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); auto easy = TEasyCurlBuffer::Make(InFlight, DownloadedBytes, UploadedBytes, std::move(url), TEasyCurl::EMethod::GET, std::move(data), std::move(headers), offset, sizeLimit, std::move(callback), retryPolicy ? 
retryPolicy->CreateRetryState() : nullptr, InitConfig, DnsGateway.GetDNSCurlList()); Await.emplace(std::move(easy)); Wakeup(sizeLimit); @@ -915,13 +915,14 @@ friend class IHTTPGateway; const ::NMonitoring::TDynamicCounters::TCounterPtr& inflightCounter) final { auto stream = TEasyCurlStream::Make(InFlightStreams, DownloadedBytes, UploadedBytes, std::move(url), std::move(headers), offset, sizeLimit, std::move(onStart), std::move(onNewData), std::move(onFinish), inflightCounter, InitConfig, DnsGateway.GetDNSCurlList()); - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); const auto handle = stream->GetHandle(); TEasyCurlStream::TWeakPtr weak = stream; Streams.emplace_back(stream); Allocated.emplace(handle, std::move(stream)); Wakeup(0ULL); - return [weak](TIssue issue) { + return [weak, sync=Sync](TIssue issue) { + const std::unique_lock lock(*sync); if (const auto& stream = weak.lock()) stream->Cancel(issue); }; @@ -932,7 +933,7 @@ friend class IHTTPGateway; } void OnRetry(TEasyCurlBuffer::TPtr easy) { - const std::unique_lock lock(Sync); + const std::unique_lock lock(SyncRef()); const size_t sizeLimit = easy->GetSizeLimit(); Await.emplace(std::move(easy)); Wakeup(sizeLimit); @@ -950,6 +951,10 @@ friend class IHTTPGateway; } private: + std::mutex& SyncRef() { + return *Sync; + } + CURLM* Handle = nullptr; std::queue Await; @@ -959,7 +964,7 @@ friend class IHTTPGateway; std::unordered_map Allocated; std::priority_queue> Delayed; - std::mutex Sync; + std::shared_ptr Sync = std::make_shared(); std::thread Thread; std::atomic IsStopped = false; diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index 165b61ba6ab4..f07a7951f6dc 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -167,7 +167,11 @@ struct TReadSpec { NDB::ColumnsWithTypeAndName CHColumns; std::shared_ptr ArrowSchema; 
NDB::FormatSettings Settings; - TString Format, Compression; + // It's very important to keep here std::string instead of TString + // because of the cast from TString to std::string is using the MutRef (it isn't thread-safe). + // This behaviour can be found in the getInputFormat call + std::string Format; + TString Compression; ui64 SizeLimit = 0; ui32 BlockLengthPosition = 0; std::vector ColumnReorder; @@ -1373,12 +1377,7 @@ class TS3StreamReadActor : public TActorBootstrapped, public DecodedChunkSizeHist, HttpInflightSize, HttpDataRps, - DeferredQueueSize, - ReadSpec->Format, - ReadSpec->Compression, - ReadSpec->ArrowSchema, - ReadSpec->RowSpec, - ReadSpec->Settings + DeferredQueueSize ); if (!UseRuntimeListing) { diff --git a/ydb/library/yql/providers/s3/common/source_context.h b/ydb/library/yql/providers/s3/common/source_context.h index f418540d2dbd..2ee071e63b25 100644 --- a/ydb/library/yql/providers/s3/common/source_context.h +++ b/ydb/library/yql/providers/s3/common/source_context.h @@ -34,12 +34,7 @@ struct TSourceContext { , NMonitoring::THistogramPtr decodedChunkSizeHist , NMonitoring::TDynamicCounters::TCounterPtr httpInflightSize , NMonitoring::TDynamicCounters::TCounterPtr httpDataRps - , NMonitoring::TDynamicCounters::TCounterPtr deferredQueueSize - , const TString format - , const TString compression - , std::shared_ptr schema - , std::unordered_map> rowTypes - , NDB::FormatSettings settings) + , NMonitoring::TDynamicCounters::TCounterPtr deferredQueueSize) : SourceId(sourceId) , Limit(limit) , ActorSystem(actorSystem) @@ -54,11 +49,6 @@ struct TSourceContext { , HttpInflightSize(httpInflightSize) , HttpDataRps(httpDataRps) , DeferredQueueSize(deferredQueueSize) - , Format(format) - , Compression(compression) - , Schema(schema) - , RowTypes(rowTypes) - , Settings(settings) { } @@ -105,11 +95,6 @@ struct TSourceContext { NMonitoring::TDynamicCounters::TCounterPtr HttpInflightSize; NMonitoring::TDynamicCounters::TCounterPtr HttpDataRps; 
NMonitoring::TDynamicCounters::TCounterPtr DeferredQueueSize; - const TString Format; - const TString Compression; - std::shared_ptr Schema; - std::unordered_map> RowTypes; - NDB::FormatSettings Settings; private: std::atomic_uint64_t Value; std::mutex Mutex; From 14282424bedf691bb6ed91b63472c8baba9a57ff Mon Sep 17 00:00:00 2001 From: Vitaly Isaev Date: Sun, 15 Sep 2024 22:52:02 +0300 Subject: [PATCH 28/56] Merge #9092 (#9240) --- .../actors/ut/yql_generic_lookup_actor_ut.cpp | 2 +- .../actors/yql_generic_lookup_actor.cpp | 42 +++++++---- .../generic/actors/yql_generic_read_actor.cpp | 72 +++++++++++++++---- .../actors/yql_generic_token_provider.cpp | 41 ++++++----- .../actors/yql_generic_token_provider.h | 5 +- 5 files changed, 114 insertions(+), 48 deletions(-) diff --git a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp index c57667366e10..48c5b2951098 100644 --- a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp +++ b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp @@ -78,7 +78,7 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { dsi.Setdatabase("some_db"); dsi.Setuse_tls(true); dsi.set_protocol(::NYql::NConnector::NApi::EProtocol::NATIVE); - auto token = dsi.mutable_credentials() -> mutable_token(); + auto token = dsi.mutable_credentials()->mutable_token(); token->Settype("IAM"); token->Setvalue("TEST_TOKEN"); diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp index 956a0b7c30bc..18374715fa32 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp @@ -152,7 +152,14 @@ namespace NYql::NDq { Y_ABORT_UNLESS(response.splits_size() == 1); auto& split = response.splits(0); 
NConnector::NApi::TReadSplitsRequest readRequest; - *readRequest.mutable_data_source_instance() = GetDataSourceInstanceWithToken(); + + *readRequest.mutable_data_source_instance() = LookupSource.data_source_instance(); + auto error = TokenProvider->MaybeFillToken(*readRequest.mutable_data_source_instance()); + if (error) { + SendError(TActivationContext::ActorSystem(), SelfId(), std::move(error)); + return; + } + *readRequest.add_splits() = split; readRequest.Setformat(NConnector::NApi::TReadSplitsRequest_EFormat::TReadSplitsRequest_EFormat_ARROW_IPC_STREAMING); Connector->ReadSplits(readRequest).Subscribe([actorSystem = TActivationContext::ActorSystem(), selfId = SelfId()](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) { @@ -194,9 +201,16 @@ namespace NYql::NDq { YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << SelfId() << " Got LookupRequest for " << request.size() << " keys"; Y_ABORT_IF(InProgress); Y_ABORT_IF(request.size() == 0 || request.size() > MaxKeysInRequest); + Request = std::move(request); NConnector::NApi::TListSplitsRequest splitRequest; - *splitRequest.add_selects() = CreateSelect(); + + auto error = FillSelect(*splitRequest.add_selects()); + if (error) { + SendError(TActivationContext::ActorSystem(), SelfId(), std::move(error)); + return; + }; + splitRequest.Setmax_split_count(1); Connector->ListSplits(splitRequest).Subscribe([actorSystem = TActivationContext::ActorSystem(), selfId = SelfId()](const NConnector::TListSplitsStreamIteratorAsyncResult& asyncResult) { auto result = ExtractFromConstFuture(asyncResult); @@ -285,6 +299,12 @@ namespace NYql::NDq { SendError(actorSystem, selfId, NConnector::ErrorFromGRPCStatus(status)); } + static void SendError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, TString error) { + NConnector::NApi::TError dst; + *dst.mutable_message() = error; + SendError(actorSystem, selfId, std::move(dst)); + } + private: enum class EColumnDestination { Key, @@ -314,17 +334,13 @@ 
namespace NYql::NDq { return result; } - NYql::NConnector::NApi::TDataSourceInstance GetDataSourceInstanceWithToken() const { + TString FillSelect(NConnector::NApi::TSelect& select) { auto dsi = LookupSource.data_source_instance(); - //Note: returned token may be stale and we have no way to check or recover here - //Consider to redesign ICredentialsProvider - TokenProvider->MaybeFillToken(dsi); - return dsi; - } - - NConnector::NApi::TSelect CreateSelect() { - NConnector::NApi::TSelect select; - *select.mutable_data_source_instance() = GetDataSourceInstanceWithToken(); + auto error = TokenProvider->MaybeFillToken(dsi); + if (error) { + return error; + } + *select.mutable_data_source_instance() = dsi; for (ui32 i = 0; i != SelectResultType->GetMembersCount(); ++i) { auto c = select.mutable_what()->add_items()->mutable_column(); @@ -349,7 +365,7 @@ namespace NYql::NDq { *disjunction.mutable_operands()->Add()->mutable_conjunction() = conjunction; } *select.mutable_where()->mutable_filter_typed()->mutable_disjunction() = disjunction; - return select; + return {}; } private: diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp index fa58560997da..93b3ba116adb 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp @@ -59,7 +59,14 @@ namespace NYql::NDq { void Bootstrap() { Become(&TGenericReadActor::StateFunc); - InitSplitsListing(); + auto issue = InitSplitsListing(); + if (issue) { + return NotifyComputeActorWithIssue( + TActivationContext::ActorSystem(), + ComputeActorId_, + InputIndex_, + std::move(*issue)); + }; } static constexpr char ActorName[] = "GENERIC_READ_ACTOR"; @@ -79,13 +86,18 @@ namespace NYql::NDq { // ListSplits - void InitSplitsListing() { + TMaybe InitSplitsListing() { YQL_CLOG(DEBUG, ProviderGeneric) << "Start splits listing"; // Prepare request 
NConnector::NApi::TListSplitsRequest request; NConnector::NApi::TSelect select = Source_.select(); // copy TSelect from source - TokenProvider_->MaybeFillToken(*select.mutable_data_source_instance()); + + auto error = TokenProvider_->MaybeFillToken(*select.mutable_data_source_instance()); + if (error) { + return TIssue(error); + } + *request.mutable_selects()->Add() = std::move(select); // Initialize stream @@ -100,6 +112,8 @@ namespace NYql::NDq { TEvListSplitsIterator>( actorSystem, selfId, computeActorId, inputIndex, future); }); + + return Nothing(); } void Handle(TEvListSplitsIterator::TPtr& ev) { @@ -145,7 +159,16 @@ namespace NYql::NDq { // Server sent EOF, now we are ready to start splits reading if (NConnector::GrpcStatusEndOfStream(status)) { YQL_CLOG(DEBUG, ProviderGeneric) << "Handle :: EvListSplitsFinished :: last message was reached, start data reading"; - return InitSplitsReading(); + auto issue = InitSplitsReading(); + if (issue) { + return NotifyComputeActorWithIssue( + TActivationContext::ActorSystem(), + ComputeActorId_, + InputIndex_, + std::move(*issue)); + } + + return; } // Server temporary failure @@ -163,13 +186,14 @@ namespace NYql::NDq { } // ReadSplits - void InitSplitsReading() { + TMaybe InitSplitsReading() { YQL_CLOG(DEBUG, ProviderGeneric) << "Start splits reading"; if (Splits_.empty()) { YQL_CLOG(WARN, ProviderGeneric) << "Accumulated empty list of splits"; ReadSplitsFinished_ = true; - return NotifyComputeActorWithData(); + NotifyComputeActorWithData(); + return Nothing(); } // Prepare request @@ -177,13 +201,16 @@ namespace NYql::NDq { request.set_format(NConnector::NApi::TReadSplitsRequest::ARROW_IPC_STREAMING); request.mutable_splits()->Reserve(Splits_.size()); - std::for_each( - Splits_.cbegin(), Splits_.cend(), - [&](const NConnector::NApi::TSplit& split) { - NConnector::NApi::TSplit splitCopy = split; - TokenProvider_->MaybeFillToken(*splitCopy.mutable_select()->mutable_data_source_instance()); - 
*request.mutable_splits()->Add() = std::move(split); - }); + for (const auto& split : Splits_) { + NConnector::NApi::TSplit splitCopy = split; + + auto error = TokenProvider_->MaybeFillToken(*splitCopy.mutable_select()->mutable_data_source_instance()); + if (error) { + return TIssue(std::move(error)); + } + + *request.mutable_splits()->Add() = std::move(splitCopy); + } // Start streaming Client_->ReadSplits(request).Subscribe( @@ -197,6 +224,8 @@ namespace NYql::NDq { TEvReadSplitsIterator>( actorSystem, selfId, computeActorId, inputIndex, future); }); + + return Nothing(); } void Handle(TEvReadSplitsIterator::TPtr& ev) { @@ -308,8 +337,8 @@ namespace NYql::NDq { static void NotifyComputeActorWithError( TActorSystem* actorSystem, - const NActors::TActorId computeActorId, - const ui64 inputIndex, + NActors::TActorId computeActorId, + ui64 inputIndex, const NConnector::NApi::TError& error) { actorSystem->Send(computeActorId, new TEvAsyncInputError( @@ -319,6 +348,19 @@ namespace NYql::NDq { return; } + static void NotifyComputeActorWithIssue( + TActorSystem* actorSystem, + NActors::TActorId computeActorId, + ui64 inputIndex, + TIssue issue) { + actorSystem->Send(computeActorId, + new TEvAsyncInputError( + inputIndex, + TIssues{std::move(issue)}, + NDqProto::StatusIds::StatusCode::StatusIds_StatusCode_INTERNAL_ERROR)); + return; + } + i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& buffer, TMaybe&, bool& finished, diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp index 55ff0e53db1c..f2651cac0d1c 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp @@ -1,6 +1,7 @@ #include "yql_generic_token_provider.h" #include +#include namespace NYql::NDq { TGenericTokenProvider::TGenericTokenProvider(const TString& staticIamToken) @@ -9,29 +10,26 @@ 
namespace NYql::NDq { } TGenericTokenProvider::TGenericTokenProvider( - const TString& serviceAccountId, - const TString& ServiceAccountIdSignature, - const ISecuredServiceAccountCredentialsFactory::TPtr& credentialsFactory) - { + const TString& serviceAccountId, const TString& ServiceAccountIdSignature, + const ISecuredServiceAccountCredentialsFactory::TPtr& credentialsFactory) { Y_ENSURE(!serviceAccountId.Empty(), "No service account provided"); Y_ENSURE(!ServiceAccountIdSignature.Empty(), "No service account signature provided"); Y_ENSURE(credentialsFactory, "CredentialsFactory is not initialized"); auto structuredTokenJSON = - TStructuredTokenBuilder() - .SetServiceAccountIdAuth(serviceAccountId, ServiceAccountIdSignature) - .ToJson(); + TStructuredTokenBuilder().SetServiceAccountIdAuth(serviceAccountId, ServiceAccountIdSignature).ToJson(); Y_ENSURE(structuredTokenJSON, "empty structured token"); - auto credentialsProviderFactory = CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, structuredTokenJSON, false); + auto credentialsProviderFactory = + CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, structuredTokenJSON, false); CredentialsProvider_ = credentialsProviderFactory->CreateProvider(); } - void TGenericTokenProvider::MaybeFillToken(NConnector::NApi::TDataSourceInstance& dsi) const { + TString TGenericTokenProvider::MaybeFillToken(NConnector::NApi::TDataSourceInstance& dsi) const { // 1. Don't need tokens if basic auth is set if (dsi.credentials().has_basic()) { - return; + return {}; } *dsi.mutable_credentials()->mutable_token()->mutable_type() = "IAM"; @@ -39,29 +37,36 @@ namespace NYql::NDq { // 2. If static IAM-token has been provided, use it if (!StaticIAMToken_.empty()) { *dsi.mutable_credentials()->mutable_token()->mutable_value() = StaticIAMToken_; - return; + return {}; } // 3. 
Otherwise use credentials provider to get token Y_ENSURE(CredentialsProvider_, "CredentialsProvider is not initialized"); - auto iamToken = CredentialsProvider_->GetAuthInfo(); + TString iamToken; + try { + iamToken = CredentialsProvider_->GetAuthInfo(); + } catch (const std::exception& e) { + YQL_CLOG(ERROR, ProviderGeneric) << "MaybeFillToken: " << e.what(); + return TString(e.what()); + } + Y_ENSURE(iamToken, "CredentialsProvider returned empty IAM token"); *dsi.mutable_credentials()->mutable_token()->mutable_value() = std::move(iamToken); + return {}; } TGenericTokenProvider::TPtr - CreateGenericTokenProvider( - const TString& staticIamToken, - const TString& serviceAccountId, const TString& serviceAccountIdSignature, - const ISecuredServiceAccountCredentialsFactory::TPtr& credentialsFactory) - { + CreateGenericTokenProvider(const TString& staticIamToken, const TString& serviceAccountId, + const TString& serviceAccountIdSignature, + const ISecuredServiceAccountCredentialsFactory::TPtr& credentialsFactory) { if (!staticIamToken.Empty()) { return std::make_unique(staticIamToken); } if (!serviceAccountId.Empty()) { - return std::make_unique(serviceAccountId, serviceAccountIdSignature, credentialsFactory); + return std::make_unique(serviceAccountId, serviceAccountIdSignature, + credentialsFactory); } return std::make_unique(); } diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h index c5f4d3331def..6ff0d1fd578d 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h +++ b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace NYql::NDq { // When accessing external data sources using authentication via tokens, @@ -19,7 +20,9 @@ namespace NYql::NDq { const TString& ServiceAccountIdSignature, const ISecuredServiceAccountCredentialsFactory::TPtr& 
credentialsFactory); - void MaybeFillToken(NConnector::NApi::TDataSourceInstance& dsi) const; + // MaybeFillToken sets IAM-token within DataSourceInstance. + // Returns string containing error, if it happened. + TString MaybeFillToken(NConnector::NApi::TDataSourceInstance& dsi) const; private: TString StaticIAMToken_; From ae0f0309459665d01250bb91922092977df66427 Mon Sep 17 00:00:00 2001 From: yumkam Date: Tue, 17 Sep 2024 23:05:59 +0300 Subject: [PATCH 29/56] streamlookup: fix for left table name is prefix of right table (backport #9177) (#9218) --- .../input_transforms/dq_input_transform_lookup.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 9f2f52f17bb8..77b53fff73fa 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -352,15 +352,17 @@ TOutputRowColumnOrder CategorizeOutputRowItems( size_t idxRightPayload = 0; for (ui32 i = 0; i != type->GetMembersCount(); ++i) { const auto prefixedName = type->GetMemberName(i); - if (prefixedName.starts_with(leftLabel)) { - Y_ABORT_IF(prefixedName.length() == leftLabel.length()); + if (prefixedName.starts_with(leftLabel) && + prefixedName.length() > leftLabel.length() && + prefixedName[leftLabel.length()] == '.') { const auto name = prefixedName.SubStr(leftLabel.length() + 1); //skip prefix and dot result[i] = { leftJoinColumns.contains(name) ? 
EOutputRowItemSource::InputKey : EOutputRowItemSource::InputOther, idxLeft++ }; - } else if (prefixedName.starts_with(rightLabel)) { - Y_ABORT_IF(prefixedName.length() == rightLabel.length()); + } else if (prefixedName.starts_with(rightLabel) && + prefixedName.length() > rightLabel.length() && + prefixedName[rightLabel.length()] == '.') { const auto name = prefixedName.SubStr(rightLabel.length() + 1); //skip prefix and dot //presume that indexes in LookupKey, LookupOther has the same relative position as in OutputRow if (rightJoinColumns.contains(name)) { From d40fa7292fe81bae931baf7c9abf8ac168e28623 Mon Sep 17 00:00:00 2001 From: uzhastik Date: Wed, 18 Sep 2024 20:50:50 +0300 Subject: [PATCH 30/56] Fix some bugs in jsonpath (#9146) (#9442) Co-authored-by: Vadim Averin --- ydb/library/yql/minikql/jsonpath/value.cpp | 12 ++++++------ ydb/library/yql/udfs/common/yson2/yson2_udf.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ydb/library/yql/minikql/jsonpath/value.cpp b/ydb/library/yql/minikql/jsonpath/value.cpp index ad77506d8176..e42e5f13bd44 100644 --- a/ydb/library/yql/minikql/jsonpath/value.cpp +++ b/ydb/library/yql/minikql/jsonpath/value.cpp @@ -277,7 +277,7 @@ ui32 TValue::GetSize() const { return value->GetDictLength(); } } else { - Y_ABORT("Unexpected variant case in GetString"); + Y_ABORT("Unexpected variant case in GetSize"); } } @@ -287,9 +287,9 @@ TValue TValue::GetElement(ui32 index) const { if (const auto* value = std::get_if(&Value)) { return TValue(value->GetElement(index)); } else if (const auto* value = std::get_if(&Value)) { - return TValue(value->GetElement(index)); + return TValue(value->Lookup(TUnboxedValuePod(index))); } else { - Y_ABORT("Unexpected variant case in GetString"); + Y_ABORT("Unexpected variant case in GetElement"); } } @@ -304,7 +304,7 @@ TArrayIterator TValue::GetArrayIterator() const { } return TArrayIterator(value->GetListIterator()); } else { - Y_ABORT("Unexpected variant case in GetString"); + 
Y_ABORT("Unexpected variant case in GetArrayIterator"); } } @@ -332,7 +332,7 @@ TMaybe TValue::Lookup(const TStringBuf key) const { return Nothing(); } } else { - Y_ABORT("Unexpected variant case in GetString"); + Y_ABORT("Unexpected variant case in Lookup"); } } @@ -347,7 +347,7 @@ TObjectIterator TValue::GetObjectIterator() const { } return TObjectIterator(value->GetDictIterator()); } else { - Y_ABORT("Unexpected variant case in GetString"); + Y_ABORT("Unexpected variant case in GetObjectIterator"); } } diff --git a/ydb/library/yql/udfs/common/yson2/yson2_udf.cpp b/ydb/library/yql/udfs/common/yson2/yson2_udf.cpp index 45ff1bb8ffae..b97304c55a2f 100644 --- a/ydb/library/yql/udfs/common/yson2/yson2_udf.cpp +++ b/ydb/library/yql/udfs/common/yson2/yson2_udf.cpp @@ -313,9 +313,9 @@ TUnboxedValuePod LookupImpl(TUnboxedValuePod dict, const TUnboxedValuePod key, c if (index < 0) index += size; if constexpr (Converter != nullptr) { - return Converter(dict.GetElement(index).Release(), valueBuilder, pos); + return Converter(dict.Lookup(TUnboxedValuePod(index)).Release(), valueBuilder, pos); } - return dict.GetElement(index).Release(); + return dict.Lookup(TUnboxedValuePod(index)).Release(); } } } From c006f27800470d23cae7afc10067c08c13e94c76 Mon Sep 17 00:00:00 2001 From: Hor911 Date: Fri, 20 Sep 2024 08:32:53 +0300 Subject: [PATCH 31/56] Timeline support (#9533) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Олег <150132506+iddqdex@users.noreply.github.com> --- ydb/core/fq/libs/compute/common/utils.cpp | 45 +- ydb/core/fq/libs/compute/common/utils.h | 6 +- ydb/core/fq/libs/compute/common/ya.make | 1 + ydb/core/fq/libs/config/protos/common.proto | 2 + .../internal/task_ping.cpp | 4 + .../proto/yq_internal.proto | 1 + .../fq/libs/control_plane_storage/util.cpp | 1 + .../ydb_control_plane_storage_queries.cpp | 6 + ydb/core/fq/libs/protos/fq_private.proto | 1 + ydb/public/api/protos/draft/fq.proto | 5 + 
.../lib/ydb_cli/commands/ydb_benchmark.cpp | 11 +- ydb/public/lib/ydb_cli/common/plan2svg.cpp | 1646 +++++++++++++++++ ydb/public/lib/ydb_cli/common/plan2svg.h | 264 +++ ydb/public/lib/ydb_cli/common/ya.make | 1 + ydb/tests/fq/plans/test_stats_mode.py | 6 +- ydb/tests/olap/lib/ydb_cli.py | 6 +- ydb/tests/olap/load/conftest.py | 2 + ydb/tests/tools/fq_runner/kikimr_utils.py | 1 + ydb/tests/tools/kqprun/src/kqp_runner.cpp | 2 +- 19 files changed, 1995 insertions(+), 16 deletions(-) create mode 100644 ydb/public/lib/ydb_cli/common/plan2svg.cpp create mode 100644 ydb/public/lib/ydb_cli/common/plan2svg.h diff --git a/ydb/core/fq/libs/compute/common/utils.cpp b/ydb/core/fq/libs/compute/common/utils.cpp index a7fc8cd81aa1..40c06436fb4c 100644 --- a/ydb/core/fq/libs/compute/common/utils.cpp +++ b/ydb/core/fq/libs/compute/common/utils.cpp @@ -1,4 +1,5 @@ #include "utils.h" +#include #include #include @@ -430,7 +431,7 @@ void EnumeratePlans(NYson::TYsonWriter& writer, NJson::TJsonValue& value, ui32& } } -TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage) { +TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage, TString* timeline, ui64 maxTimelineSize) { TStringStream out; NYson::TYsonWriter writer(&out); writer.OnBeginMap(); @@ -471,6 +472,24 @@ TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage) { } } } + if (timeline) { + TPlanVisualizer planViz; + planViz.LoadPlans(plan); + *timeline = planViz.PrintSvgSafe(); + if (maxTimelineSize && timeline->size() > maxTimelineSize) { + TStringBuilder builder; + builder + << "" << Endl + << " There is nothing wrong with the request." << Endl + << " Unfortunately, image size " << timeline->size() << " is too large." 
<< Endl + << " It exceeds limit of " << maxTimelineSize << " and was discarded" << Endl + << "" << Endl; + *timeline = builder; + } + // remove json "timeline" field after migration + writer.OnKeyedItem("timeline"); + writer.OnStringScalar(*timeline); + } writer.OnEndMap(); return NJson2Yson::ConvertYson2Json(out.Str()); } @@ -1145,7 +1164,7 @@ struct TNoneStatProcessor : IPlanStatProcessor { return plan; } - TString GetQueryStat(const TString&, double& cpuUsage) override { + TString GetQueryStat(const TString&, double& cpuUsage, TString*, ui64) override { cpuUsage = 0.0; return ""; } @@ -1178,8 +1197,8 @@ struct TPlanStatProcessor : IPlanStatProcessor { return plan; } - TString GetQueryStat(const TString& plan, double& cpuUsage) override { - return GetV1StatFromV2Plan(plan, &cpuUsage); + TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 maxtimelineSize) override { + return GetV1StatFromV2Plan(plan, &cpuUsage, timeline, maxtimelineSize); } TPublicStat GetPublicStat(const TString& stat) override { @@ -1210,8 +1229,8 @@ struct TProfileStatProcessor : TPlanStatProcessor { }; struct TProdStatProcessor : TFullStatProcessor { - TString GetQueryStat(const TString& plan, double& cpuUsage) override { - return GetPrettyStatistics(GetV1StatFromV2Plan(plan, &cpuUsage)); + TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 maxtimelineSize) override { + return GetPrettyStatistics(GetV1StatFromV2Plan(plan, &cpuUsage, timeline, maxtimelineSize)); } }; @@ -1229,8 +1248,12 @@ std::unique_ptr CreateStatProcessor(const TString& statViewN PingTaskRequestBuilder::PingTaskRequestBuilder(const NConfig::TCommonConfig& commonConfig, std::unique_ptr&& processor) : Compressor(commonConfig.GetQueryArtifactsCompressionMethod(), commonConfig.GetQueryArtifactsCompressionMinSize()) - , Processor(std::move(processor)) -{} + , Processor(std::move(processor)), ShowQueryTimeline(commonConfig.GetShowQueryTimeline()), 
MaxQueryTimelineSize(commonConfig.GetMaxQueryTimelineSize()) +{ + if (!MaxQueryTimelineSize) { + MaxQueryTimelineSize = 200_KB; + } +} Fq::Private::PingTaskRequest PingTaskRequestBuilder::Build( const Ydb::TableStats::QueryStats& queryStats, @@ -1294,9 +1317,13 @@ Fq::Private::PingTaskRequest PingTaskRequestBuilder::Build(const TString& queryP CpuUsage = 0.0; try { - auto stat = Processor->GetQueryStat(plan, CpuUsage); + TString timeline; + auto stat = Processor->GetQueryStat(plan, CpuUsage, ShowQueryTimeline ? &timeline : nullptr, MaxQueryTimelineSize); pingTaskRequest.set_statistics(stat); pingTaskRequest.set_dump_raw_statistics(true); + if (timeline) { + pingTaskRequest.set_timeline(timeline); + } auto flatStat = Processor->GetFlatStat(plan); flatStat["CompilationTimeUs"] = compilationTimeUs; flatStat["ComputeTimeUs"] = computeTimeUs; diff --git a/ydb/core/fq/libs/compute/common/utils.h b/ydb/core/fq/libs/compute/common/utils.h index 8fd7d56419bc..efd11787838d 100644 --- a/ydb/core/fq/libs/compute/common/utils.h +++ b/ydb/core/fq/libs/compute/common/utils.h @@ -28,7 +28,7 @@ inline std::shared_ptr CreateNewTableClient(const TS tableSettings); } -TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage = nullptr); +TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage = nullptr, TString* timeline = nullptr, ui64 maxTimelineSize = 0); TString GetV1StatFromV2PlanV2(const TString& plan); TString GetPrettyStatistics(const TString& statistics); THashMap AggregateStats(TStringBuf plan); @@ -55,7 +55,7 @@ struct IPlanStatProcessor { virtual Ydb::Query::StatsMode GetStatsMode() = 0; virtual TString ConvertPlan(const TString& plan) = 0; virtual TString GetPlanVisualization(const TString& plan) = 0; - virtual TString GetQueryStat(const TString& plan, double& cpuUsage) = 0; + virtual TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 maxtimelineSize) = 0; virtual TPublicStat GetPublicStat(const TString& stat) = 0; 
virtual THashMap GetFlatStat(TStringBuf plan) = 0; }; @@ -79,6 +79,8 @@ class PingTaskRequestBuilder { private: const TCompressor Compressor; std::unique_ptr Processor; + bool ShowQueryTimeline = false; + ui64 MaxQueryTimelineSize = 0; }; TString GetStatViewName(const ::NFq::TRunActorParams& params); diff --git a/ydb/core/fq/libs/compute/common/ya.make b/ydb/core/fq/libs/compute/common/ya.make index cf9a359f840b..efdb54097732 100644 --- a/ydb/core/fq/libs/compute/common/ya.make +++ b/ydb/core/fq/libs/compute/common/ya.make @@ -19,6 +19,7 @@ PEERDIR( ydb/library/yql/providers/generic/connector/api/service/protos ydb/library/yql/providers/generic/connector/libcpp ydb/library/yql/providers/s3/actors_factory + ydb/public/lib/ydb_cli/common ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/fq/libs/config/protos/common.proto b/ydb/core/fq/libs/config/protos/common.proto index 297436bc2b6f..1b3da64d754b 100644 --- a/ydb/core/fq/libs/config/protos/common.proto +++ b/ydb/core/fq/libs/config/protos/common.proto @@ -29,4 +29,6 @@ message TCommonConfig { bool KeepInternalErrors = 13; bool UseNativeProtocolForClickHouse = 14; bool DisableSslForGenericDataSources = 15; + bool ShowQueryTimeline = 16; + uint64 MaxQueryTimelineSize = 17; // default: 200KB } diff --git a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp index b214e127d44c..9e7baaa95d6e 100644 --- a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp +++ b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp @@ -285,6 +285,10 @@ TPingTaskParams ConstructHardPingTask( internal.set_current_load(request.current_load()); } + if (request.timeline()) { + internal.set_timeline(request.timeline()); + } + if (request.flat_stats_size() != 0) { internal.clear_statistics(); auto stats = DeserializeFlatStats(request.flat_stats()); diff --git a/ydb/core/fq/libs/control_plane_storage/proto/yq_internal.proto 
b/ydb/core/fq/libs/control_plane_storage/proto/yq_internal.proto index 2da81596f71c..205117599e4e 100644 --- a/ydb/core/fq/libs/control_plane_storage/proto/yq_internal.proto +++ b/ydb/core/fq/libs/control_plane_storage/proto/yq_internal.proto @@ -57,6 +57,7 @@ message QueryInternal { NYql.NDqProto.StatusIds.StatusCode pending_status_code = 28; repeated StatisticsNamedValue statistics = 29; int32 current_load = 30; + string timeline = 31; } message JobInternal { diff --git a/ydb/core/fq/libs/control_plane_storage/util.cpp b/ydb/core/fq/libs/control_plane_storage/util.cpp index db0f310b509f..58f672e293c0 100644 --- a/ydb/core/fq/libs/control_plane_storage/util.cpp +++ b/ydb/core/fq/libs/control_plane_storage/util.cpp @@ -179,6 +179,7 @@ bool DoesPingTaskUpdateQueriesTable(const Fq::Private::PingTaskRequest& request) || !request.issues().empty() || !request.transient_issues().empty() || request.statistics() + || request.timeline() || !request.result_set_meta().empty() || request.ast() || request.ast_compressed().data() diff --git a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp index ae371c52b811..966afc7c7957 100644 --- a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp +++ b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_queries.cpp @@ -594,6 +594,12 @@ void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvDescribeQue } } } + + auto timeline = internal.timeline(); + if (timeline) { + result.mutable_query()->mutable_timeline()->set_svg(timeline); + } + if (!permissions.Check(TPermissions::VIEW_AST)) { result.mutable_query()->clear_ast(); } else { diff --git a/ydb/core/fq/libs/protos/fq_private.proto b/ydb/core/fq/libs/protos/fq_private.proto index 04cc5d7c5d9d..879095566509 100644 --- a/ydb/core/fq/libs/protos/fq_private.proto +++ b/ydb/core/fq/libs/protos/fq_private.proto @@ -166,6 +166,7 @@ message 
PingTaskRequest { bool dump_raw_statistics = 38; repeated Ydb.ValuePair flat_stats = 39; int32 current_load = 40; + string timeline = 41; } message PingTaskResult { diff --git a/ydb/public/api/protos/draft/fq.proto b/ydb/public/api/protos/draft/fq.proto index d1713334a047..f76c01ef4944 100644 --- a/ydb/public/api/protos/draft/fq.proto +++ b/ydb/public/api/protos/draft/fq.proto @@ -205,6 +205,10 @@ message ResultSetMeta { bool truncated = 3; } +message QueryTimeline { + string svg = 1; // No validation because generated on server side +} + message Query { QueryMeta meta = 1; QueryContent content = 2; @@ -214,6 +218,7 @@ message Query { QueryStatistics statistics = 6; repeated ResultSetMeta result_set_meta = 7; QueryAst ast = 8; + QueryTimeline timeline = 9; } message QueryStatistics { diff --git a/ydb/public/lib/ydb_cli/commands/ydb_benchmark.cpp b/ydb/public/lib/ydb_cli/commands/ydb_benchmark.cpp index 16148eb13ee4..5576991f7d69 100644 --- a/ydb/public/lib/ydb_cli/commands/ydb_benchmark.cpp +++ b/ydb/public/lib/ydb_cli/commands/ydb_benchmark.cpp @@ -1,7 +1,8 @@ #include "ydb_benchmark.h" #include "benchmark_utils.h" -#include #include +#include +#include #include #include #include @@ -93,7 +94,7 @@ TString TWorkloadCommandBenchmark::PatchQuery(const TStringBuf& original) const std::vector lines; for (auto& line : StringSplitter(result).Split('\n').SkipEmpty()) { - if (line.StartsWith("--")) { + if (line.StartsWith("--") && !line.StartsWith("--!")) { continue; } @@ -386,6 +387,12 @@ bool TWorkloadCommandBenchmark::RunBench(TClient& client, NYdbWorkload::IWorkloa TFileOutput out(PlanFileName + ".ast"); out << res.GetPlanAst(); } + { + TPlanVisualizer pv; + pv.LoadPlans(res.GetQueryPlan()); + TFileOutput out(PlanFileName + ".svg"); + out << pv.PrintSvgSafe(); + } planSaved = true; } } diff --git a/ydb/public/lib/ydb_cli/common/plan2svg.cpp b/ydb/public/lib/ydb_cli/common/plan2svg.cpp new file mode 100644 index 000000000000..23b52ba9dfe2 --- /dev/null +++ 
b/ydb/public/lib/ydb_cli/common/plan2svg.cpp @@ -0,0 +1,1646 @@ +#include "plan2svg.h" + +#include + +constexpr ui32 INDENT_X = 8; +constexpr ui32 GAP_X = 3; +constexpr ui32 GAP_Y = 3; +constexpr ui32 TIME_HEIGHT = 10; +constexpr ui32 INTERNAL_GAP_Y = 2; +constexpr ui32 INTERNAL_GAP_X = 2; +constexpr ui32 INTERNAL_HEIGHT = 14; +constexpr ui32 INTERNAL_WIDTH = 16; +constexpr ui32 INTERNAL_TEXT_HEIGHT = 8; +constexpr ui32 TIME_SERIES_RANGES = 32; + +TString FormatDurationMs(ui64 durationMs) { + TStringBuilder builder; + + if (durationMs && durationMs < 100) { + builder << durationMs << "ms"; + } else { + auto seconds = durationMs / 1'000; + if (seconds >= 60) { + auto minutes = seconds / 60; + if (minutes >= 60) { + auto hours = minutes / 60; + builder << hours << 'h'; + if (hours < 24) { + auto minutes60 = minutes % 60; + builder << ' '; + if (minutes60 < 10) { + builder << '0'; + } + builder << minutes60 << 'm'; + } + } else { + auto seconds60 = seconds % 60; + builder << minutes << "m "; + if (seconds60 < 10) { + builder << '0'; + } + builder << seconds60 << 's'; + } + } else { + auto hundredths = (durationMs % 1'000) / 10; + builder << seconds << '.'; + if (hundredths < 10) { + builder << '0'; + } + builder << hundredths << 's'; + } + } + + return builder; +} + +TString FormatDurationUs(ui64 durationUs) { + if (durationUs && durationUs < 1000) { + return TStringBuilder() << durationUs << "us"; + } + + return FormatDurationMs(durationUs / 1000); +} + +TString FormatUsage(ui64 usec) { + return FormatDurationUs(usec); +} + +TString FormatIntegerValue(ui64 i, ui32 scale = 1000, const TString& suffix = "") { + if (i < scale) { + return Sprintf("%lu%s", i, suffix.c_str()); + } + for (auto c : "KMGTP") { + auto pcs = (i % scale) * 100 / scale; + i /= scale; + if (i < scale || c == 'P') { + return Sprintf("%lu.%.2lu%c%s", i, pcs, c, suffix.c_str()); + } + } + return ""; +} + +TString FormatBytes(ui64 bytes) { + return FormatIntegerValue(bytes, 1024, "B"); +} + +TString 
FormatTimeMs(ui64 time, bool shortFormat) { + if (shortFormat) { + time /= 10; + return Sprintf("%lu.%.2lu", time / 100, time % 100); + } else { + time /= 1000; + return Sprintf("%lu:%.2lu", time / 60, time % 60); + } +} + +TString FormatTimeMs(ui64 time) { + return FormatTimeMs(time, time < 60000); +} + +TString FormatTimeAgg(const TAggregation& agg, bool shortFormat) { + TStringBuilder result; + result << FormatTimeMs(agg.Min, shortFormat) << " | " << FormatTimeMs(agg.Avg, shortFormat) << " | " << FormatTimeMs(agg.Max, shortFormat); + return result; +} + +TString FormatMCpu(ui64 mCpu) { + mCpu /= 10; + return Sprintf("%lu.%.2lu", mCpu / 100, mCpu % 100); +} + +TString GetEstimation(const NJson::TJsonValue& node) { + TStringBuilder ebuilder; + auto* eCostNode = node.GetValueByPath("E-SelfCost"); + if (!eCostNode) { + eCostNode = node.GetValueByPath("E-Cost"); + } + if (eCostNode) { + auto costString = eCostNode->GetStringSafe(); + if (costString != "No estimate") { + ebuilder << "Est:"; + double cost; + if (TryFromString(costString, cost)) { + if (cost >= 1e+18) { + ebuilder << Sprintf(" %.2e", cost); + } else { + ebuilder << ' ' << FormatIntegerValue(static_cast(cost)); + } + } + if (auto* eRowsNode = node.GetValueByPath("E-Rows")) { + double rows; + if (TryFromString(eRowsNode->GetStringSafe(), rows)) { + if (rows >= 1e+18) { + ebuilder << Sprintf(" Rows: %.2e", rows); + } else { + ebuilder << " Rows: " << FormatIntegerValue(static_cast(rows)); + } + } + } + if (auto* eSizeNode = node.GetValueByPath("E-Size")) { + double size; + if (TryFromString(eSizeNode->GetStringSafe(), size)) { + if (size >= 1e+18) { + ebuilder << Sprintf(" Size: %.2e", size); + } else { + ebuilder << " Size: " << FormatBytes(static_cast(size)); + } + } + } + } + } + return ebuilder; +} + +bool TAggregation::Load(const NJson::TJsonValue& node) { + if (auto* countNode = node.GetValueByPath("Count")) { + Count = countNode->GetIntegerSafe(); + + if (Count == 0) { + return false; + } + + if 
(auto* sumNode = node.GetValueByPath("Sum")) { + Sum = sumNode->GetIntegerSafe(); + } + Avg = Sum / Count; + if (auto* minNode = node.GetValueByPath("Min")) { + Min = minNode->GetIntegerSafe(); + } else { + Min = Avg; + } + if (auto* maxNode = node.GetValueByPath("Max")) { + Max = maxNode->GetIntegerSafe(); + } else { + Max = Avg; + } + + return true; + } + return false; +} + +void TMetricHistory::Load(const NJson::TJsonValue& node, ui64 explicitMinTime, ui64 explicitMaxTime) { + std::vector times; + std::vector values; + + bool even = true; + + for (const auto& subNode : node.GetArray()) { + ui64 i = subNode.GetIntegerSafe(); + if (even) times.push_back(i); + else values.push_back(i); + even = !even; + } + + if (times.size() > values.size()) { + times.resize(values.size()); + } + + Load(times, values, explicitMinTime, explicitMaxTime); +} + +void TMetricHistory::Load(std::vector& times, std::vector& values, ui64 explicitMinTime, ui64 explicitMaxTime) { + if (times.size() < 2) { + return; + } + auto itt = times.begin(); + auto itv = values.begin(); + + MinTime = explicitMinTime ? explicitMinTime : *itt; + MaxTime = explicitMaxTime ? 
explicitMaxTime : times.back(); + + ui64 prevValue = *itv++; + ui64 prevTime = *itt++; + + while (itt != times.end() && *itt <= MinTime) { + prevValue = *itv++; + prevTime = *itt++; + } + + Deriv.resize(TIME_SERIES_RANGES + 1); + Deriv[0].first = MinTime; + + ui64 timeLeft = MinTime; + for (ui32 i = 1; i <= TIME_SERIES_RANGES; i++) { + + ui64 timeRight = MinTime + (MaxTime - MinTime) * i / TIME_SERIES_RANGES; + Deriv[i].first = timeRight; + while (itt != times.end() && *itt <= timeRight) { + ui64 delta = (*itv - prevValue); + if (prevTime >= timeLeft) { + Deriv[i].second += delta; + } else { + ui64 currDelta = delta * (*itt - timeLeft) / (*itt - prevTime); + Deriv[i].second += currDelta; + if (i > 1) { + Deriv[i - 1].second += delta - currDelta; + } + } + prevTime = *itt++; + prevValue = *itv++; + } + + timeLeft = timeRight; + } + + if (itt != times.end()) { + Deriv[TIME_SERIES_RANGES].second += (*itv - prevValue) * (*itt - MaxTime) / (*itt - prevTime); + } + for (ui32 i = 1; i <= TIME_SERIES_RANGES; i++) { + MaxDeriv = std::max(MaxDeriv, Deriv[i].second); + } + bool missed = false; + for (ui32 i = 0; i < times.size(); i++) { + auto t = times[i]; + if (t < MinTime) { + missed = true; + continue; + } + if (missed && t > MinTime) { + Values.emplace_back(MinTime, values[i - 1]); + } + if (t > MaxTime) { + if (i && times[i - 1] < MaxTime) { + Values.emplace_back(MaxTime, values[i - 1]); + } + break; + } + Values.emplace_back(t, values[i]); + } + for (auto& p : Values) { + MaxValue = std::max(MaxValue, p.second); + } +} + +TSingleMetric::TSingleMetric(std::shared_ptr summary, const NJson::TJsonValue& node, + const NJson::TJsonValue* firstMessageNode, const NJson::TJsonValue* lastMessageNode, + const NJson::TJsonValue* waitTimeUsNode) + : Summary(summary) { + + if (!Details.Load(node)) { + return; + } + + Summary->Add(Details.Sum); + + if (firstMessageNode) { + FirstMessage.Load(*firstMessageNode); + } + + if (lastMessageNode) { + LastMessage.Load(*lastMessageNode); + } 
+ + if (auto* historyNode = node.GetValueByPath("History")) { + History.Load(*historyNode, FirstMessage.Min, LastMessage.Max); + MinTime = History.MinTime; + MaxTime = History.MaxTime; + } + + if (waitTimeUsNode) { + WaitTime.Load(*waitTimeUsNode, FirstMessage.Min, LastMessage.Max); + MinTime = MinTime ? std::min(MinTime, WaitTime.MinTime) : WaitTime.MinTime; + MaxTime = MaxTime ? std::max(MaxTime, WaitTime.MaxTime) : WaitTime.MaxTime; + } else if (FirstMessage.Min && LastMessage.Max) { + MinTime = MinTime ? std::min(MinTime, FirstMessage.Min) : FirstMessage.Min; + MaxTime = MaxTime ? std::max(MaxTime, LastMessage.Max) : LastMessage.Max; + } +} + +void TPlan::Load(const NJson::TJsonValue& node) { + if (auto* subplanNameNode = node.GetValueByPath("Subplan Name")) { + auto subplanName = subplanNameNode->GetStringSafe(); + if (subplanName.StartsWith("CTE ")) { + if (auto* nodeTypeNode = node.GetValueByPath("Node Type")) { + CteSubPlans[subplanName] = nodeTypeNode->GetStringSafe(); + } + } + } + + if (auto* subNode = node.GetValueByPath("Plans")) { + for (auto& plan : subNode->GetArray()) { + TString nodeType; + if (auto* nodeTypeNode = plan.GetValueByPath("Node Type")) { + nodeType = nodeTypeNode->GetStringSafe(); + } + if (auto* planNodeTypeNode = plan.GetValueByPath("PlanNodeType")) { + auto planNodeType = planNodeTypeNode->GetStringSafe(); + ythrow yexception() << "Unexpected plan node type [" << planNodeType << "]"; + } else { + Stages.push_back(std::make_shared(nodeType)); + LoadStage(Stages.back(), plan, 0); + } + } + } + + if (!TotalCpuTimes.empty()) { + TotalCpuTime.Load(TotalCpuTimes, TotalCpuValues, TotalCpuTimes.front(), TotalCpuTimes.back()); + } +} + +void TPlan::ResolveCteRefs() { + for (auto& memberRef : MemberRefs) { + auto it = CteSubPlans.find(memberRef.first); + if (it == CteSubPlans.end()) { + ythrow yexception() << "Can not find CTE Ref " << memberRef.first; + } + memberRef.second.first->Info.at(memberRef.second.second) = "Reference: " + 
it->second; + } + for (auto& cteRef : CteRefs) { + auto it = CteStages.find(cteRef.first); + if (it == CteStages.end()) { + ythrow yexception() << "Can not find CTE Ref " << cteRef.first; + } + + cteRef.second->FromStage = it->second; + if (cteRef.second->StatsNode) { + if (auto* inputNode = cteRef.second->StatsNode->GetValueByPath("Input")) { + for (const auto& subNode : inputNode->GetArray()) { + if (auto* nameNode = subNode.GetValueByPath("Name")) { + if (ToString(it->second->PlanNodeId) == nameNode->GetStringSafe()) { + if (auto* pushNode = subNode.GetValueByPath("Push")) { + if (auto* bytesNode = pushNode->GetValueByPath("Bytes")) { + cteRef.second->InputBytes = std::make_shared(InputBytes, + *bytesNode, + pushNode->GetValueByPath("FirstMessageMs"), + pushNode->GetValueByPath("LastMessageMs"), + pushNode->GetValueByPath("WaitTimeUs.History") + ); + MaxTime = std::max(MaxTime, cteRef.second->InputBytes->MaxTime); + } + if (auto* rowsNode = pushNode->GetValueByPath("Rows")) { + cteRef.second->InputRows = std::make_shared(InputRows, *rowsNode); + } + } + } + } + } + } + } + if (cteRef.second->FromStage->StatsNode) { + if (auto* outputNode = cteRef.second->FromStage->StatsNode->GetValueByPath("Output")) { + for (const auto& subNode : outputNode->GetArray()) { + if (auto* nameNode = subNode.GetValueByPath("Name")) { + if (ToString(cteRef.second->StagePlanNodeId) == nameNode->GetStringSafe()) { + if (auto* popNode = subNode.GetValueByPath("Pop")) { + if (auto* bytesNode = popNode->GetValueByPath("Bytes")) { + cteRef.second->CteOutputBytes = std::make_shared(OutputBytes, + *bytesNode, + popNode->GetValueByPath("FirstMessageMs"), + popNode->GetValueByPath("LastMessageMs"), + popNode->GetValueByPath("WaitTimeUs.History") + ); + MaxTime = std::max(MaxTime, cteRef.second->CteOutputBytes->MaxTime); + } + if (auto* rowsNode = popNode->GetValueByPath("Rows")) { + cteRef.second->CteOutputRows = std::make_shared(OutputRows, *rowsNode); + } + } + } + } + } + } + } + } +} + 
+void TPlan::LoadStage(std::shared_ptr stage, const NJson::TJsonValue& node, ui32 parentPlanNodeId) { + + if (auto* planNodeIdNode = node.GetValueByPath("PlanNodeId")) { + stage->PlanNodeId = planNodeIdNode->GetIntegerSafe(); + } + + if (auto* subplanNameNode = node.GetValueByPath("Subplan Name")) { + auto subplanName = subplanNameNode->GetStringSafe(); + if (subplanName.StartsWith("CTE ")) { + CteStages[subplanName] = stage; + } + } + + auto operators = node.GetValueByPath("Operators"); + + if (operators) { + TString prevFilter; + std::set references; + for (const auto& subNode : operators->GetArray()) { + if (auto* nameNode = subNode.GetValueByPath("Name")) { + auto name = nameNode->GetStringSafe(); + + if (name == "Iterator" || name == "Member") { + if (auto* referenceNode = subNode.GetValueByPath(name)) { + auto referenceName = referenceNode->GetStringSafe(); + if (references.contains(referenceName)) { + continue; + } + if (name == "Iterator" && !referenceName.StartsWith("precompute_")) { + continue; + } + } + } + + if (name == "Filter" && prevFilter) { + if (auto* predicateNode = subNode.GetValueByPath("Predicate")) { + auto filter = predicateNode->GetStringSafe(); + if (filter == prevFilter) { + continue; + } + } + } + prevFilter = ""; + + TStringBuilder builder; + + if (name == "Iterator" || name == "Member") { + builder << "Reference"; + } else { + builder << name; + } + + if (name == "Limit") { + if (auto* limitNode = subNode.GetValueByPath("Limit")) { + builder << ": " << limitNode->GetStringSafe(); + } + } else if (name == "Filter") { + if (auto* predicateNode = subNode.GetValueByPath("Predicate")) { + auto filter = predicateNode->GetStringSafe(); + prevFilter = filter; + while(true) { + auto p = filter.find("item."); + if (p == filter.npos) { + break; + } + filter.erase(p, 5); + } + while(true) { + auto p = filter.find('<'); + if (p == filter.npos) { + break; + } + filter.erase(p, 1); + filter.insert(p, "<"); + } + builder << ": " << filter; + } + } 
else if (name == "TopSort") { + if (auto* limitNode = subNode.GetValueByPath("Limit")) { + builder << ", Limit: " << limitNode->GetStringSafe(); + } + if (auto* topSortByNode = subNode.GetValueByPath("TopSortBy")) { + builder << ", TopSortBy: " << topSortByNode->GetStringSafe(); + } + } else if (name == "Iterator" || name == "Member") { + if (auto* referenceNode = subNode.GetValueByPath(name)) { + auto referenceName = referenceNode->GetStringSafe(); + references.insert(referenceName); + builder << ": " << referenceName; + auto cteRef = "CTE " + referenceName; + auto stageCopy = stage; + MemberRefs.emplace_back(cteRef, std::make_pair, ui32>(std::move(stageCopy), stage->Info.size())); + } + } else if (name.Contains("Join")) { + if (auto* conditionNode = subNode.GetValueByPath("Condition")) { + builder << " on " << conditionNode->GetStringSafe(); + } + } + stage->Info.push_back(builder); + + auto est = GetEstimation(subNode); + if (est) { + stage->Info.push_back(est); + } + } + } + } + + stage->StatsNode = node.GetValueByPath("Stats"); + + const NJson::TJsonValue* inputNode = nullptr; + + if (stage->StatsNode) { + + if (auto* tasksNode = stage->StatsNode->GetValueByPath("Tasks")) { + stage->Tasks = tasksNode->GetIntegerSafe(); + Tasks += stage->Tasks; + } + + if (auto* physicalStageIdNode = stage->StatsNode->GetValueByPath("PhysicalStageId")) { + stage->PhysicalStageId = physicalStageIdNode->GetIntegerSafe(); + } + + if (auto* baseTimeNode = stage->StatsNode->GetValueByPath("BaseTimeMs")) { + stage->BaseTime = baseTimeNode->GetIntegerSafe(); + if (BaseTime == 0) { + BaseTime = stage->BaseTime; + } else { + BaseTime = std::min(BaseTime, stage->BaseTime); + } + } + + if (auto* cpuTimeNode = stage->StatsNode->GetValueByPath("CpuTimeUs")) { + stage->CpuTime = std::make_shared(CpuTime, *cpuTimeNode); + + std::vector updatedCpuTimes; + std::vector updatedCpuValues; + + auto itt = TotalCpuTimes.begin(); + auto itv = TotalCpuValues.begin(); + auto ith = 
stage->CpuTime->History.Values.begin(); + + ui64 v0 = 0; + ui64 v1 = 0; + ui64 t = 0; + + while (itt != TotalCpuTimes.end() || ith != stage->CpuTime->History.Values.end()) { + + if (itt == TotalCpuTimes.end()) { + t = ith->first; + v1 = ith->second; + ith++; + } else if (ith == stage->CpuTime->History.Values.end()) { + t = *itt++; + v0 = *itv++; + } else if (*itt == ith->first) { + t = *itt++; + v0 = *itv++; + v1 = ith->second; + ith++; + } else if (*itt > ith->first) { + t = ith->first; + v1 = ith->second; + ith++; + } else { + t = *itt++; + v0 = *itv++; + } + + updatedCpuTimes.push_back(t); + updatedCpuValues.push_back(v0 + v1); + } + + TotalCpuTimes.swap(updatedCpuTimes); + TotalCpuValues.swap(updatedCpuValues); + } + + if (auto* mmuNode = stage->StatsNode->GetValueByPath("MaxMemoryUsage")) { + stage->MaxMemoryUsage = std::make_shared(MaxMemoryUsage, *mmuNode); + } + + if (auto* spillingComputeBytesNode = stage->StatsNode->GetValueByPath("SpillingComputeBytes")) { + stage->SpillingComputeBytes = std::make_shared(SpillingComputeBytes, *spillingComputeBytesNode); + } + + if (auto* spillingComputeTimeNode = stage->StatsNode->GetValueByPath("SpillingComputeTimeUs")) { + stage->SpillingComputeTime = std::make_shared(SpillingComputeTime, *spillingComputeTimeNode); + } + + if (auto* spillingChannelBytesNode = stage->StatsNode->GetValueByPath("SpillingChannelBytes")) { + stage->SpillingChannelBytes = std::make_shared(SpillingChannelBytes, *spillingChannelBytesNode); + } + + if (auto* spillingChannelTimeNode = stage->StatsNode->GetValueByPath("SpillingChannelTimeUs")) { + stage->SpillingChannelTime = std::make_shared(SpillingChannelTime, *spillingChannelTimeNode); + } + + if (auto* outputNode = stage->StatsNode->GetValueByPath("Output")) { + for (const auto& subNode : outputNode->GetArray()) { + if (auto* nameNode = subNode.GetValueByPath("Name")) { + if (ToString(parentPlanNodeId) == nameNode->GetStringSafe()) { + if (auto* popNode = subNode.GetValueByPath("Pop")) { + 
if (auto* bytesNode = popNode->GetValueByPath("Bytes")) { + stage->OutputBytes = std::make_shared(OutputBytes, + *bytesNode, + popNode->GetValueByPath("FirstMessageMs"), + popNode->GetValueByPath("LastMessageMs"), + popNode->GetValueByPath("WaitTimeUs.History") + ); + MaxTime = std::max(MaxTime, stage->OutputBytes->MaxTime); + } + if (auto* rowsNode = popNode->GetValueByPath("Rows")) { + stage->OutputRows = std::make_shared(OutputRows, *rowsNode); + } + } + } + } + } + } + + inputNode = stage->StatsNode->GetValueByPath("Input"); + } + + if (auto* subNode = node.GetValueByPath("Plans")) { + for (auto& plan : subNode->GetArray()) { + TString subNodeType; + if (auto* nodeTypeNode = plan.GetValueByPath("Node Type")) { + subNodeType = nodeTypeNode->GetStringSafe(); + } + TString planNodeType; + if (auto* planNodeTypeNode = plan.GetValueByPath("PlanNodeType")) { + planNodeType = planNodeTypeNode->GetStringSafe(); + } + if (planNodeType == "Connection") { + auto* keyColumnsNode = plan.GetValueByPath("KeyColumns"); + if (auto* subNode = plan.GetValueByPath("Plans")) { + for (auto& plan : subNode->GetArray()) { + TString nodeType; + if (auto* nodeTypeNode = plan.GetValueByPath("Node Type")) { + nodeType = nodeTypeNode->GetStringSafe(); + } + if (auto* planNodeTypeNode = plan.GetValueByPath("PlanNodeType")) { + auto planNodeType = planNodeTypeNode->GetStringSafe(); + if (planNodeType) { + ythrow yexception() << "Unexpected plan node type [" << planNodeType << "]"; + } + } + auto connection = std::make_shared(subNodeType, stage->PlanNodeId); + stage->Connections.push_back(connection); + if (keyColumnsNode) { + for (auto& keyColumn : keyColumnsNode->GetArray()) { + stage->Connections.back()->KeyColumns.push_back(keyColumn.GetStringSafe()); + } + } + + if (auto* planNodeIdNode = plan.GetValueByPath("PlanNodeId")) { + auto planNodeId = planNodeIdNode->GetStringRobust(); + if (inputNode) { + for (const auto& subNode : inputNode->GetArray()) { + if (auto* nameNode = 
subNode.GetValueByPath("Name")) { + if (planNodeId == nameNode->GetStringSafe()) { + if (auto* pushNode = subNode.GetValueByPath("Push")) { + if (auto* bytesNode = pushNode->GetValueByPath("Bytes")) { + connection->InputBytes = std::make_shared(InputBytes, + *bytesNode, + pushNode->GetValueByPath("FirstMessageMs"), + pushNode->GetValueByPath("LastMessageMs"), + pushNode->GetValueByPath("WaitTimeUs.History") + ); + MaxTime = std::max(MaxTime, connection->InputBytes->MaxTime); + } + if (auto* rowsNode = pushNode->GetValueByPath("Rows")) { + connection->InputRows = std::make_shared(InputRows, *rowsNode); + } + } + } + } + } + } + } + + Stages.push_back(std::make_shared(nodeType)); + connection->FromStage = Stages.back(); + LoadStage(Stages.back(), plan, stage->PlanNodeId); + } + } else if (auto* cteNameNode = plan.GetValueByPath("CTE Name")) { + auto cteName = "CTE " + cteNameNode->GetStringSafe(); + auto connection = std::make_shared(subNodeType, stage->PlanNodeId); + connection->CteConnection = true; + stage->Connections.push_back(connection); + if (keyColumnsNode) { + for (auto& keyColumn : keyColumnsNode->GetArray()) { + stage->Connections.back()->KeyColumns.push_back(keyColumn.GetStringSafe()); + } + } + CteRefs.emplace_back(cteName, stage->Connections.back()); + stage->Connections.back()->StatsNode = stage->StatsNode; + } + } else if (planNodeType == "") { + if (subNodeType == "Source") { + if (stage->Source) { + ythrow yexception() << "Plan stage already has linked Source [" << stage->Source->NodeType << "]"; + } + stage->Source = std::make_shared(subNodeType); + LoadSource(stage->Source, plan); + if (!stage->Source->Info.empty()) { + stage->Info.insert(stage->Info.end(), stage->Source->Info.begin(), stage->Source->Info.end()); + } + + if (stage->StatsNode) { + if (auto* ingressTopNode = stage->StatsNode->GetValueByPath("Ingress")) { + if (auto* ingressNode = (*ingressTopNode)[0].GetValueByPath("Ingress")) { + if (auto* bytesNode = 
ingressNode->GetValueByPath("Bytes")) { + stage->Source->IngressBytes = std::make_shared(IngressBytes, + *bytesNode, + ingressNode->GetValueByPath("FirstMessageMs"), + ingressNode->GetValueByPath("LastMessageMs"), + ingressNode->GetValueByPath("WaitTimeUs.History") + ); + MaxTime = std::max(MaxTime, stage->Source->IngressBytes->MaxTime); + } + if (auto* rowsNode = ingressNode->GetValueByPath("Rows")) { + stage->Source->IngressRows = std::make_shared(IngressRows, *rowsNode); + } + } + } + } + + } else { + stage->Connections.push_back(std::make_shared("Implicit", stage->PlanNodeId)); + Stages.push_back(std::make_shared(subNodeType)); + stage->Connections.back()->FromStage = Stages.back(); + LoadStage(Stages.back(), plan, stage->PlanNodeId); + } + } else { + ythrow yexception() << "Unexpected plan node type [" << planNodeType << "]"; + } + } + } +} + +void TPlan::LoadSource(std::shared_ptr source, const NJson::TJsonValue& node) { + + auto operators = node.GetValueByPath("Operators"); + + if (operators) { + for (const auto& subNode : operators->GetArray()) { + TStringBuilder builder; + builder << "Source"; + if (auto* sourceTypeNode = subNode.GetValueByPath("SourceType")) { + builder << " " << sourceTypeNode->GetStringSafe(); + } + if (auto* nameNode = subNode.GetValueByPath("Name")) { + builder << " " << nameNode->GetStringSafe() << "("; + } + if (auto* readColumnsNode = subNode.GetValueByPath("ReadColumns")) { + bool firstColumn = true; + for (const auto& subNode : readColumnsNode->GetArray()) { + if (firstColumn) { + firstColumn = false; + } else { + builder << ", "; + } + builder << subNode.GetStringSafe(); + } + } + builder << ")"; + source->Info.push_back(builder); + + auto est = GetEstimation(subNode); + if (est) { + source->Info.push_back(est); + } + break; + } + } +} + +void TPlan::MarkStageIndent(ui32 indent, ui32& offsetY, std::shared_ptr stage) { + if (stage->IndentX < indent) { + stage->IndentX = indent; + } + + stage->OffsetY = offsetY; + ui32 height = 
std::max(stage->Connections.size() + (stage->Source ? 1 : 0) + 3, 4) * (INTERNAL_HEIGHT + INTERNAL_GAP_Y) + INTERNAL_GAP_Y; + stage->Height = height; + stage->IndentY = stage->OffsetY + height; + offsetY += (height + GAP_Y); + + if (stage->Connections.size() > 1) { + indent += (INDENT_X + GAP_X); + } + + for (auto c : stage->Connections) { + if (c->CteConnection) { + c->CteIndentX = indent; + c->CteOffsetY = offsetY; + offsetY += GAP_Y + INTERNAL_HEIGHT + INTERNAL_GAP_Y * 2; + stage->IndentY = std::max(stage->IndentY, offsetY); + } else { + MarkStageIndent(indent, offsetY, c->FromStage); + stage->IndentY = std::max(stage->IndentY, c->FromStage->IndentY); + } + } +} + +void TPlan::MarkLayout() { + ui32 offsetY = 0; + MarkStageIndent(0, offsetY, Stages.front()); + // Compress Reference(s) + for (auto& stage : Stages) { + auto& info = stage->Info; + ui32 i = 0; + while (i < info.size()) { + auto& s = info[i]; + if (s.starts_with("Reference: ")) { + auto next = i + 1; + if (next < info.size()) { + auto& sn = info[next]; + if (sn.starts_with("Reference: ")) { + s.insert(9, "s"); + while (next < info.size()) { + auto& sn = info[next]; + if (sn.starts_with("Reference: ")) { + s += ", " + sn.substr(11); + info.erase(info.begin() + next); + } else { + break; + } + } + } + } + } + i++; + } + } +} + +void TPlan::PrintTimeline(TStringBuilder& background, TStringBuilder& canvas, const TString& title, TAggregation& firstMessage, TAggregation& lastMessage, ui32 x, ui32 y, ui32 w, ui32 h, const TString& color) { + + auto firstMin = firstMessage.Min * w / MaxTime; + auto lastMax = lastMessage.Max * w / MaxTime; + + background + << "" << title << ", Duration: " << FormatTimeMs(lastMessage.Max - firstMessage.Min) << " (" << FormatTimeAgg(firstMessage, lastMessage.Max < 60000) << " - " << FormatTimeAgg(lastMessage, lastMessage.Max < 60000) << ")" + << "" << Endl; + + if (firstMessage.Min < firstMessage.Max) { + auto firstAvg = firstMessage.Avg * w / MaxTime; + auto firstMax = 
firstMessage.Max * w / MaxTime; + canvas + << " " << Endl + << " " << Endl; + } + + if (lastMessage.Min < lastMessage.Max) { + auto lastMin = lastMessage.Min * w / MaxTime; + auto lastAvg = lastMessage.Avg * w / MaxTime; + canvas + << " " << Endl + << " " << Endl; + } + + background + << "" << Endl; +} + +void TPlan::PrintWaitTime(TStringBuilder& background, std::shared_ptr metric, ui32 x, ui32 y, ui32 w, ui32 h, const TString& fillColor) { + + if (metric->WaitTime.MaxDeriv == 0) { + return; + } + + background + << "" << Endl; +} + +void TPlan::PrintDeriv(TStringBuilder& canvas, TMetricHistory& history, ui32 x, ui32 y, ui32 w, ui32 h, const TString& title, const TString& lineColor, const TString& fillColor) { + + if (history.MaxDeriv == 0) { + return; + } + + if (title) { + canvas << "" << title << "" << Endl; + } + + canvas + << (fillColor ? "(item.second * h / history.MaxDeriv, 1)) << " "; + } + + canvas + << x + history.Deriv.back().first * w / MaxTime << "," << y + (h - 1) << " " + << "' stroke-width='1' stroke='" << lineColor << "' fill='" << (fillColor ? fillColor : "none") << "' />" << Endl; + + if (title) { + canvas << "" << Endl; + } +} + +void TPlan::PrintValues(TStringBuilder& canvas, std::shared_ptr metric, ui32 x, ui32 y, ui32 w, ui32 h, const TString& title, const TString& lineColor, const TString& fillColor) { + + if (metric->History.MaxValue == 0) { + return; + } + + if (title) { + canvas << "" << title << "" << Endl; + } + + canvas + << (fillColor ? "History.Values.front().first * w / MaxTime << "," << y + (h - 1) << " "; + + for (auto& item : metric->History.Values) { + canvas << x + item.first * w / MaxTime << "," << y + (h - std::max(item.second * h / metric->History.MaxValue, 1)) << " "; + } + + canvas + << x + metric->History.Values.back().first * w / MaxTime << "," << y + (h - 1) << " " + << "' stroke-width='1' stroke='" << lineColor << "' fill='" << (fillColor ? 
fillColor : "none") << "' />" << Endl; + + if (title) { + canvas << "" << Endl; + } +} + +void TPlan::PrintStageSummary(TStringBuilder& background, TStringBuilder&, ui32 y0, std::shared_ptr metric, const TString& mediumColor, const TString& lightColor, const TString& textSum, const TString& tooltip) { + ui32 x0 = Config.HeaderWidth + GAP_X + INTERNAL_GAP_X; + ui32 width = Config.SummaryWidth - INTERNAL_GAP_X * 2; + if (metric->Summary && metric->Summary->Max) { + width = metric->Details.Sum * width / metric->Summary->Max; + } + background + << "" << tooltip << "" << Endl; + if (metric->Details.Max) { + auto wavg = width / 2; + if (metric->Details.Max > metric->Details.Min) { + wavg = (metric->Details.Avg - metric->Details.Min) * width / (metric->Details.Max - metric->Details.Min); + } + background + << " " + << " " << Endl; + } else { + background + << " " << Endl; + } + if (textSum) { + background + << "" << Endl + << "" << textSum << "" << Endl; + } + background + << "" << Endl; +} + +void TPlan::PrintSvg(ui64 maxTime, ui32& offsetY, TStringBuilder& background, TStringBuilder& canvas) { + OffsetY = offsetY; + ui32 planHeight = 0; + + for (auto& s : Stages) { + planHeight = std::max(planHeight, s->IndentY); + background + << "" << Endl; + auto x = Config.HeaderWidth + GAP_X; + background + << "" << Endl; + x += Config.SummaryWidth + GAP_X; + background + << "" << Endl; + if (s->Connections.size() > 1) { + ui32 y = s->OffsetY + s->Height; + background + << "" << Endl; + } + background + << "" << Endl + << "" << ToString(s->PhysicalStageId) << "" << Endl; + + { + ui32 y0 = s->OffsetY + INTERNAL_TEXT_HEIGHT + (INTERNAL_HEIGHT - INTERNAL_TEXT_HEIGHT) / 2 + offsetY; + if (!s->Info.empty()) { + for (auto text : s->Info) { + canvas + << "" << text << "" << Endl; + y0 += (INTERNAL_TEXT_HEIGHT + INTERNAL_GAP_Y); + } + } else { + canvas + << "" << s->NodeType << "" << Endl; + } + } + + + ui32 y0 = s->OffsetY + offsetY + INTERNAL_GAP_Y; + + auto tx0 = Config.HeaderWidth + 
GAP_X + Config.SummaryWidth + GAP_X + INTERNAL_GAP_X; + auto tx1 = Config.Width - INTERNAL_GAP_X; + auto tw = tx1 - tx0; + auto px = tx0 + TimeOffset * tw / maxTime; + auto pw = MaxTime * tw / maxTime; + + auto taskCount = s->CpuTime ? s->CpuTime->Details.Count : 0; + + if (s->OutputBytes) { + auto textSum = FormatBytes(s->OutputBytes->Details.Sum); + TStringBuilder tooltip; + tooltip + << "Output " + << s->OutputBytes->Details.Sum * 100 / s->OutputBytes->Summary->Value << "%, \u2211" + << textSum << ", " << FormatBytes(s->OutputBytes->Details.Min) << " | " + << FormatBytes(s->OutputBytes->Details.Avg) << " | " << FormatBytes(s->OutputBytes->Details.Max); + if (s->OutputRows && s->OutputRows->Details.Sum) { + tooltip + << ", Rows \u2211" + << FormatIntegerValue(s->OutputRows->Details.Sum) << ", " << FormatIntegerValue(s->OutputRows->Details.Min) << " | " + << FormatIntegerValue(s->OutputRows->Details.Avg) << " | " << FormatIntegerValue(s->OutputRows->Details.Max) + << ", Width " << FormatBytes(s->OutputBytes->Details.Sum / s->OutputRows->Details.Sum); + } + PrintStageSummary(background, canvas, y0, s->OutputBytes, Config.Palette.OutputMedium, Config.Palette.OutputLight, textSum, tooltip); + + if (s->SpillingChannelBytes && s->SpillingChannelBytes->Details.Sum) { + auto textSum = FormatBytes(s->SpillingChannelBytes->Details.Sum); + auto x1 = Config.HeaderWidth + GAP_X + Config.SummaryWidth + - INTERNAL_GAP_X; + auto x0 = x1 - textSum.size() * INTERNAL_TEXT_HEIGHT * 7 / 10; + background + << "" << "Channel Spilling \u2211" << textSum + << ", " << FormatBytes(s->SpillingChannelBytes->Details.Min) << " | " + << FormatBytes(s->SpillingChannelBytes->Details.Avg) << " | " << FormatBytes(s->SpillingChannelBytes->Details.Max) + << "" << Endl + << "" << Endl + << "" << textSum << "" << Endl + << "" << Endl; + } + + if (s->OutputBytes->Details.Count != taskCount) { + canvas + << "" << s->OutputBytes->Details.Count << "" << Endl; + } + + auto d = s->OutputBytes->MaxTime - 
s->OutputBytes->MinTime; + TStringBuilder title; + title << "Output"; + if (d) { + title << " " << FormatBytes(s->OutputBytes->Details.Sum * 1000 / d) << "/s"; + if (s->OutputRows) { + title << ", Rows " << FormatIntegerValue(s->OutputRows->Details.Sum * 1000 / d) << "/s"; + } + } + PrintTimeline(background, canvas, title, s->OutputBytes->FirstMessage, s->OutputBytes->LastMessage, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.OutputMedium); + + if (!s->OutputBytes->WaitTime.Deriv.empty()) { + PrintWaitTime(background, s->OutputBytes, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.OutputLight); + } + + if (!s->OutputBytes->History.Deriv.empty()) { + PrintDeriv(canvas, s->OutputBytes->History, px, y0, pw, INTERNAL_HEIGHT, "", Config.Palette.OutputDark); + } + } + + // Output is mandatory metric + y0 += INTERNAL_HEIGHT + INTERNAL_GAP_Y; + + if (s->MaxMemoryUsage) { + auto textSum = FormatBytes(s->MaxMemoryUsage->Details.Sum); + TStringBuilder tooltip; + tooltip + << "Memory " + << s->MaxMemoryUsage->Details.Sum * 100 / s->MaxMemoryUsage->Summary->Value << "%, \u2211" + << textSum << ", " << FormatBytes(s->MaxMemoryUsage->Details.Min) << " | " + << FormatBytes(s->MaxMemoryUsage->Details.Avg) << " | " << FormatBytes(s->MaxMemoryUsage->Details.Max); + PrintStageSummary(background, canvas, y0, s->MaxMemoryUsage, Config.Palette.MemMedium, Config.Palette.MemLight, textSum, tooltip); + + if (s->SpillingComputeBytes && s->SpillingComputeBytes->Details.Sum) { + auto textSum = FormatBytes(s->SpillingComputeBytes->Details.Sum); + auto x1 = Config.HeaderWidth + GAP_X + Config.SummaryWidth + - INTERNAL_GAP_X; + auto x0 = x1 - textSum.size() * INTERNAL_TEXT_HEIGHT * 7 / 10; + background + << "" << "Compute Spilling \u2211" << textSum + << ", " << FormatBytes(s->SpillingComputeBytes->Details.Min) << " | " + << FormatBytes(s->SpillingComputeBytes->Details.Avg) << " | " << FormatBytes(s->SpillingComputeBytes->Details.Max) + << "" << Endl + << "" << Endl + << "" << textSum << "" << Endl + 
<< "" << Endl; + } + + if (s->MaxMemoryUsage->Details.Count != taskCount) { + canvas + << "" << s->MaxMemoryUsage->Details.Count << "" << Endl; + } + + if (!s->MaxMemoryUsage->History.Values.empty()) { + PrintValues(canvas, s->MaxMemoryUsage, px, y0, pw, INTERNAL_HEIGHT, "Max MEM " + FormatBytes(s->MaxMemoryUsage->History.MaxValue), Config.Palette.MemMedium, Config.Palette.MemMedium); + } + + if (s->SpillingComputeBytes && !s->SpillingComputeBytes->History.Deriv.empty()) { + PrintDeriv(canvas, s->SpillingComputeBytes->History, px, y0, pw, INTERNAL_HEIGHT, "Spilling Compute", Config.Palette.SpillingBytesMedium, Config.Palette.SpillingBytesLight); + } + } + + y0 += INTERNAL_HEIGHT + INTERNAL_GAP_Y; + + if (s->CpuTime) { + auto textSum = FormatUsage(s->CpuTime->Details.Sum); + TStringBuilder tooltip; + tooltip + << "CPU Usage " + << s->CpuTime->Details.Sum * 100 / s->CpuTime->Summary->Value << "%, \u2211" + << textSum << ", " << FormatUsage(s->CpuTime->Details.Min) << " | " + << FormatUsage(s->CpuTime->Details.Avg) << " | " << FormatUsage(s->CpuTime->Details.Max); + PrintStageSummary(background, canvas, y0, s->CpuTime, Config.Palette.CpuMedium, Config.Palette.CpuLight, textSum, tooltip); + + if (taskCount) { + canvas + << "" << taskCount << "" << Endl; + } + + if (!s->CpuTime->History.Deriv.empty()) { + auto maxCpu = s->CpuTime->History.MaxDeriv * TIME_SERIES_RANGES / (s->CpuTime->History.MaxTime - s->CpuTime->History.MinTime); + PrintDeriv(canvas, s->CpuTime->History, px, y0, pw, INTERNAL_HEIGHT, "Max CPU " + FormatMCpu(maxCpu), Config.Palette.CpuMedium, Config.Palette.CpuLight); + } + + if (s->SpillingComputeTime && !s->SpillingComputeTime->History.Deriv.empty()) { + PrintDeriv(canvas, s->SpillingComputeTime->History, px, y0, pw, INTERNAL_HEIGHT, "Spilling Compute", Config.Palette.SpillingTimeMedium); + } + } + + y0 += INTERNAL_HEIGHT + INTERNAL_GAP_Y; + + for (auto& c : s->Connections) { + + auto x = c->CteConnection ? 
c->CteIndentX : c->FromStage->IndentX; + auto y = (c->CteConnection ? c->CteOffsetY : c->FromStage->OffsetY) + offsetY; + + if (c->CteConnection) { + auto xx = x; + background + << "" << Endl; + xx = Config.HeaderWidth + GAP_X; + background + << "" << Endl; + xx += Config.SummaryWidth + GAP_X; + background + << "" << Endl; + background + << "" << Endl + << "" << ToString(c->FromStage->PhysicalStageId) << "" << Endl; + + auto s = c->FromStage->Info.empty() ? c->FromStage->NodeType : c->FromStage->Info[0]; + canvas + << "" << s << "" << Endl; + + if (c->CteOutputBytes) { + auto textSum = FormatBytes(c->CteOutputBytes->Details.Sum); + TStringBuilder tooltip; + tooltip + << "Output " + << c->CteOutputBytes->Details.Sum * 100 / c->CteOutputBytes->Summary->Value << "%, \u2211" + << textSum << ", " << FormatBytes(c->CteOutputBytes->Details.Min) << " | " + << FormatBytes(c->CteOutputBytes->Details.Avg) << " | " << FormatBytes(c->CteOutputBytes->Details.Max); + if (c->CteOutputRows && c->CteOutputRows->Details.Sum) { + tooltip + << ", Rows \u2211" + << FormatIntegerValue(c->CteOutputRows->Details.Sum) << ", " << FormatIntegerValue(c->CteOutputRows->Details.Min) << " | " + << FormatIntegerValue(c->CteOutputRows->Details.Avg) << " | " << FormatIntegerValue(c->CteOutputRows->Details.Max) + << ", Width " << FormatBytes(c->CteOutputBytes->Details.Sum / c->CteOutputRows->Details.Sum); + } + PrintStageSummary(background, canvas, y + INTERNAL_GAP_Y, c->CteOutputBytes, Config.Palette.OutputMedium, Config.Palette.OutputLight, textSum, tooltip); + + canvas + << "" << c->CteOutputBytes->Details.Count << "" << Endl; + + auto d = c->CteOutputBytes->MaxTime - c->CteOutputBytes->MinTime; + TStringBuilder title; + title << "Output"; + if (d) { + title << " " << FormatBytes(c->CteOutputBytes->Details.Sum * 1000 / d) << "/s"; + if (c->CteOutputRows) { + title << ", Rows " << FormatIntegerValue(c->CteOutputRows->Details.Sum * 1000 / d) << "/s"; + } + } + PrintTimeline(background, canvas, 
title, c->CteOutputBytes->FirstMessage, c->CteOutputBytes->LastMessage, px, y + INTERNAL_GAP_Y, pw, INTERNAL_HEIGHT, Config.Palette.OutputMedium); + + if (!c->CteOutputBytes->WaitTime.Deriv.empty()) { + PrintWaitTime(background, c->CteOutputBytes, px, y + INTERNAL_GAP_Y, pw, INTERNAL_HEIGHT, Config.Palette.OutputLight); + } + + if (!c->CteOutputBytes->History.Deriv.empty()) { + PrintDeriv(canvas, c->CteOutputBytes->History, px, y + INTERNAL_GAP_Y, pw, INTERNAL_HEIGHT, "", Config.Palette.OutputDark); + } + } + } + + TString mark; + if (c->NodeType == "HashShuffle") mark = "H"; + else if (c->NodeType == "Merge") mark = "Me"; + else if (c->NodeType == "Map") mark = "Ma"; + else if (c->NodeType == "UnionAll") mark = "U"; + else if (c->NodeType == "Broadcast") mark = "B"; + else mark = "?"; + + canvas + << "Connection: " << c->NodeType; + if (!c->KeyColumns.empty()) { + canvas << " KeyColumns: "; + bool first = true; + for (auto k : c->KeyColumns) { + if (first) { + first = false; + } else { + canvas << ", "; + } + canvas << k; + } + } + canvas + << "" << Endl + << " " << Endl + << " " << mark << "" << Endl + << "" << Endl; + + if (c->InputBytes) { + auto textSum = FormatBytes(c->InputBytes->Details.Sum); + TStringBuilder tooltip; + tooltip + << "Input " + << c->InputBytes->Details.Sum * 100 / c->InputBytes->Summary->Value << "%, \u2211" + << textSum << ", " << FormatBytes(c->InputBytes->Details.Min) << " | " + << FormatBytes(c->InputBytes->Details.Avg) << " | " << FormatBytes(c->InputBytes->Details.Max); + if (c->InputRows && c->InputRows->Details.Sum) { + tooltip + << ", Rows \u2211" + << FormatIntegerValue(c->InputRows->Details.Sum) << ", " << FormatIntegerValue(c->InputRows->Details.Min) << " | " + << FormatIntegerValue(c->InputRows->Details.Avg) << " | " << FormatIntegerValue(c->InputRows->Details.Max) + << ", Width " << FormatBytes(c->InputBytes->Details.Sum / c->InputRows->Details.Sum); + } + PrintStageSummary(background, canvas, y0, c->InputBytes, 
Config.Palette.InputMedium, Config.Palette.InputLight, textSum, tooltip); + + if (c->InputBytes->Details.Count != taskCount) { + canvas + << "" << c->InputBytes->Details.Count << "" << Endl; + } + + auto d = c->InputBytes->MaxTime - c->InputBytes->MinTime; + TStringBuilder title; + title << "Input"; + if (d) { + title << " " << FormatBytes(c->InputBytes->Details.Sum * 1000 / d) << "/s"; + if (c->InputRows) { + title << ", Rows " << FormatIntegerValue(c->InputRows->Details.Sum * 1000 / d) << "/s"; + } + } + PrintTimeline(background, canvas, title, c->InputBytes->FirstMessage, c->InputBytes->LastMessage, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.InputMedium); + + if (!c->InputBytes->WaitTime.Deriv.empty()) { + PrintWaitTime(background, c->InputBytes, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.InputLight); + } + + if (!c->InputBytes->History.Deriv.empty()) { + PrintDeriv(canvas, c->InputBytes->History, px, y0, pw, INTERNAL_HEIGHT, "", Config.Palette.InputDark); + } + + y0 += INTERNAL_HEIGHT + INTERNAL_GAP_Y; + } + } + + if (s->Source && s->Source->IngressBytes) { + auto textSum = FormatBytes(s->Source->IngressBytes->Details.Sum); + TStringBuilder tooltip; + tooltip + << "Ingress " + << s->Source->IngressBytes->Details.Sum * 100 / s->Source->IngressBytes->Summary->Value << "%, \u2211" + << textSum << ", " << FormatBytes(s->Source->IngressBytes->Details.Min) << " | " + << FormatBytes(s->Source->IngressBytes->Details.Avg) << " | " << FormatBytes(s->Source->IngressBytes->Details.Max); + if (s->Source->IngressRows && s->Source->IngressRows->Details.Sum) { + tooltip + << ", Rows \u2211" + << FormatIntegerValue(s->Source->IngressRows->Details.Sum) << ", " << FormatIntegerValue(s->Source->IngressRows->Details.Min) << " | " + << FormatIntegerValue(s->Source->IngressRows->Details.Avg) << " | " << FormatIntegerValue(s->Source->IngressRows->Details.Max) + << ", Width " << FormatBytes(s->Source->IngressBytes->Details.Sum / s->Source->IngressRows->Details.Sum); + } + 
PrintStageSummary(background, canvas, y0, s->Source->IngressBytes, Config.Palette.IngressMedium, Config.Palette.IngressLight, textSum, tooltip); + + if (s->Source->IngressBytes->Details.Count != taskCount) { + canvas + << "" << s->Source->IngressBytes->Details.Count << "" << Endl; + } + + auto d = s->Source->IngressBytes->MaxTime - s->Source->IngressBytes->MinTime; + TStringBuilder title; + title << "Ingress"; + if (d) { + title << " " << FormatBytes(s->Source->IngressBytes->Details.Sum * 1000 / d) << "/s"; + if (s->Source->IngressRows) { + title << ", Rows " << FormatIntegerValue(s->Source->IngressRows->Details.Sum / d) << "/s"; + } + } + PrintTimeline(background, canvas, title, s->Source->IngressBytes->FirstMessage, s->Source->IngressBytes->LastMessage, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.IngressMedium); + + if (!s->Source->IngressBytes->WaitTime.Deriv.empty()) { + PrintWaitTime(background, s->Source->IngressBytes, px, y0, pw, INTERNAL_HEIGHT, Config.Palette.IngressLight); + } + + if (!s->Source->IngressBytes->History.Deriv.empty()) { + PrintDeriv(canvas, s->Source->IngressBytes->History, px, y0, pw, INTERNAL_HEIGHT, "", Config.Palette.IngressDark); + } + + y0 += INTERNAL_HEIGHT + INTERNAL_GAP_Y; + } + } + + offsetY += planHeight; +} + +TColorPalette::TColorPalette() { + StageMain = "var(--stage-main, #F2F2F2)"; + StageClone = "var(--stage-clone, #D9D9D9"; + StageText = "var(--stage-text, #262626)"; + StageTextHighlight = "var(--stage-texthl, #EA0703)"; + StageGrid = "var(--stage-grid, #B2B2B2"; + IngressDark = "var(--ingress-dark, #574F38)"; + IngressMedium = "var(--ingress-medium, #82723C)"; + IngressLight = "var(--ingress-light, #C0A645)"; + InputDark = "var(--input-dark, #315B34)"; + InputMedium = "var(--input-medium, #379A33)"; + InputLight = "var(--input-light, #3AC936)"; + OutputDark = "var(--output-dark, #3F5799)"; + OutputMedium = "var(--output-medium, #4E79EB)"; + OutputLight = "var(--output-light, #86A8FF)"; + MemMedium = "var(--mem-medium, 
#543B70)"; + MemLight = "var(--mem-light, #854EBD)"; + CpuMedium = "var(--cpu-medium, #EA0703)"; + CpuLight = "var(--cpu-light, #FF6866)"; + ConnectionFill= "var(--conn-fill, #BFBFBF)"; + ConnectionLine= "var(--conn-line, #BFBFBF)"; + ConnectionText= "var(--conn-text, #393939)"; + MinMaxLine = "var(--minmax-line, #FFDB4D)"; + TextLight = "var(--text-light, #FFFFFF)"; + TextInverted = "var(--text-inv, #FFFFFF)"; + TextSummary = "var(--text-summary, #262626)"; + SpillingBytesDark = "var(--spill-dark, #406B61)"; + SpillingBytesMedium = "var(--spill-medium, #599587)"; + SpillingBytesLight = "var(--spill-light, #72C0AE)"; + SpillingTimeDark = "var(--spill-dark, #406B61)"; + SpillingTimeMedium = "var(--spill-medium, #599587)"; + SpillingTimeLight = "var(--spill-light, #72C0AE)"; +} + +TPlanViewConfig::TPlanViewConfig() { + HeaderWidth = 300; + SummaryWidth = 128; + Width = 1024; +} + + +void TPlanVisualizer::LoadPlans(const TString& plans) { + NJson::TJsonReaderConfig jsonConfig; + NJson::TJsonValue jsonNode; + if (NJson::ReadJsonTree(plans, &jsonConfig, &jsonNode)) { + if (auto* topNode = jsonNode.GetValueByPath("Plan")) { + if (auto* subNode = topNode->GetValueByPath("Plans")) { + for (auto& plan : subNode->GetArray()) { + if (auto* typeNode = plan.GetValueByPath("Node Type")) { + auto nodeType = typeNode->GetStringSafe(); + LoadPlan(nodeType, plan); + } + } + } + } + } + PostProcessPlans(); +} + +void TPlanVisualizer::LoadPlan(const TString& nodeType, const NJson::TJsonValue& node) { + Plans.emplace_back(nodeType, Config, CteStages, CteSubPlans); + Plans.back().Load(node); +} + +void TPlanVisualizer::PostProcessPlans() { + // Fix CTE Refs + for (auto& p : Plans) { + p.ResolveCteRefs(); + } + // Fix Layouts + for (auto& p : Plans) { + p.MarkLayout(); + if (BaseTime == 0) { + BaseTime = p.BaseTime; + } else { + BaseTime = std::min(BaseTime, p.BaseTime); + } + } + // Fix time Offsets + for (auto& p : Plans) { + p.TimeOffset = p.BaseTime - BaseTime; + MaxTime = 
std::max(MaxTime, p.TimeOffset + p.MaxTime); + } +} + +TString TPlanVisualizer::PrintSvgSafe() { + try { + return PrintSvg(); + } catch (std::exception& e) { + return Sprintf("%s", e.what()); + } +} + +TString TPlanVisualizer::PrintSvg() { + TStringBuilder background; + TStringBuilder canvas; + TStringBuilder svg; + + ui32 offsetY = 0; + + ui32 summary3 = (Config.SummaryWidth - INTERNAL_GAP_X * 2) / 3; + for (auto& p : Plans) { + offsetY += GAP_Y; + canvas + << "" + << p.NodeType << "" << Endl; + + canvas + << "" << ToString(p.Tasks) << "" << Endl; + + canvas + << "Ingress " + << FormatBytes(p.IngressBytes->Value) << ", Rows " << FormatIntegerValue(p.IngressRows->Value); + if (p.IngressRows->Value) { + canvas + << ", Width " << p.IngressBytes->Value / p.IngressRows->Value << "B"; + } + if (p.MaxTime) { + canvas + << ", Avg " << FormatBytes(p.IngressBytes->Value * 1000 / p.MaxTime) << "/s"; + } + canvas + << "" << Endl + << " " << Endl + << " " << FormatBytes(p.IngressBytes->Value) << "" << Endl + << "" << Endl; + + canvas + << "CPU Usage " << FormatUsage(p.CpuTime->Value); + if (p.MaxTime) { + auto usagePS = p.CpuTime->Value / p.MaxTime; + usagePS /= 10; + canvas + << ", Avg " << Sprintf("%lu.%.2lu", usagePS / 100, usagePS % 100) << " CPU/s"; + } + canvas + << "" << Endl + << " " << Endl + << " " << FormatUsage(p.CpuTime->Value) << "" << Endl + << "" << Endl; + + canvas + << "Memory " << FormatBytes(p.MaxMemoryUsage->Value) << "" << Endl + << " " << Endl + << " " << FormatBytes(p.MaxMemoryUsage->Value) << "" << Endl + << "" << Endl; + + auto w = Config.Width - (Config.HeaderWidth + GAP_X + Config.SummaryWidth + GAP_X); + auto x = (Config.HeaderWidth + GAP_X + Config.SummaryWidth + GAP_X) + w * (p.MaxTime + p.TimeOffset) / MaxTime; + canvas + << "" << "Duration: " << FormatTimeMs(p.MaxTime) << ", Total " << FormatTimeMs(p.MaxTime + p.TimeOffset) << "" << Endl + << " " << Endl + << " " << FormatTimeMs(p.MaxTime + p.TimeOffset) << "" << Endl + << "" << Endl; + + 
offsetY += TIME_HEIGHT; + if (!p.TotalCpuTime.Deriv.empty()) { + + auto tx0 = Config.HeaderWidth + GAP_X + Config.SummaryWidth + GAP_X + INTERNAL_GAP_X; + auto tx1 = Config.Width - INTERNAL_GAP_X; + auto tw = tx1 - tx0; + auto maxCpu = p.TotalCpuTime.MaxDeriv * TIME_SERIES_RANGES / (p.TotalCpuTime.MaxTime - p.TotalCpuTime.MinTime); + p.PrintDeriv(canvas, p.TotalCpuTime, tx0, offsetY, tw, INTERNAL_HEIGHT, "Max CPU " + FormatMCpu(maxCpu), Config.Palette.CpuMedium, Config.Palette.CpuLight); + } + offsetY += INTERNAL_HEIGHT; + p.PrintSvg(MaxTime, offsetY, background, canvas); + } + + svg << "" << Endl; + svg << "" + << "" << Endl; + svg << TString(background) << Endl; + + { + ui64 maxSec = MaxTime / 1000; + ui64 deltaSec = 0; + + if (maxSec <= 10) deltaSec = 1; + else if (maxSec <= 20) deltaSec = 2; + else if (maxSec <= 30) deltaSec = 3; + else if (maxSec <= 40) deltaSec = 4; + else if (maxSec <= 50) deltaSec = 5; + else if (maxSec <= 60) deltaSec = 6; + else if (maxSec <= 100) deltaSec = 10; + else if (maxSec <= 150) deltaSec = 15; + else if (maxSec <= 200) deltaSec = 20; + else if (maxSec <= 300) deltaSec = 30; + else if (maxSec <= 600) deltaSec = 60; + else if (maxSec <= 1200) deltaSec = 120; + else if (maxSec <= 1800) deltaSec = 180; + else if (maxSec <= 3600) deltaSec = 360; + else { + ui64 stepSec = maxSec / 10; + deltaSec = stepSec - (stepSec % 60); + } + + auto x = Config.HeaderWidth + GAP_X + Config.SummaryWidth + GAP_X; + auto w = Config.Width - x - INTERNAL_GAP_X * 2; + + for (ui64 t = 0; t < maxSec; t += deltaSec) { + ui64 x1 = t * w / maxSec; + svg + << "" << Endl; + auto timeLabel = Sprintf("%lu:%.2lu", t / 60, t % 60); + for (auto& p : Plans) { + svg + << "" + << timeLabel << "" << Endl; + } + } + } + + svg << TString(canvas) << Endl; + svg << "" << Endl; + + return svg; +} diff --git a/ydb/public/lib/ydb_cli/common/plan2svg.h b/ydb/public/lib/ydb_cli/common/plan2svg.h new file mode 100644 index 000000000000..c4115a05f355 --- /dev/null +++ 
b/ydb/public/lib/ydb_cli/common/plan2svg.h @@ -0,0 +1,264 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include +#include + +class TStage; + +class TSummaryMetric { + +public: + ui64 Value = 0; + ui32 Count = 0; + ui64 Min = 0; + ui64 Max = 0; + + void Add(ui64 value) { + if (Count) { + Min = std::min(Min, value); + Max = std::max(Max, value); + } else { + Min = value; + Max = value; + } + Value += value; + Count++; + } + + ui64 Average() { + return Count ? (Value / Count) : 0; + } +}; + +struct TAggregation { + ui64 Min = 0; + ui64 Max = 0; + ui64 Avg = 0; + ui64 Sum = 0; + ui32 Count = 0; + + bool Load(const NJson::TJsonValue& node); +}; + +struct TMetricHistory { + std::vector> Deriv; + ui64 MaxDeriv = 0; + std::vector> Values; + ui64 MaxValue = 0; + ui64 MinTime = 0; + ui64 MaxTime = 0; + + void Load(const NJson::TJsonValue& node, ui64 explicitMinTime, ui64 explicitMaxTime); + void Load(std::vector& times, std::vector& values, ui64 explicitMinTime, ui64 explicitMaxTime); +}; + +class TSingleMetric { + +public: + TSingleMetric(std::shared_ptr summary, const NJson::TJsonValue& node, + const NJson::TJsonValue* firstMessageNode = nullptr, + const NJson::TJsonValue* lastMessageNode = nullptr, + const NJson::TJsonValue* waitTimeUsNode = nullptr); + + std::shared_ptr Summary; + TAggregation Details; + + TMetricHistory History; + TMetricHistory WaitTime; + ui64 MinTime = 0; + ui64 MaxTime = 0; + TAggregation FirstMessage; + TAggregation LastMessage; +}; + +class TConnection { + +public: + TConnection(const TString& nodeType, ui32 stagePlanNodeId) : NodeType(nodeType), StagePlanNodeId(stagePlanNodeId) { + } + + TString NodeType; + std::shared_ptr FromStage; + std::shared_ptr InputBytes; + std::shared_ptr InputRows; + std::vector KeyColumns; + bool CteConnection = false; + ui32 CteIndentX = 0; + ui32 CteOffsetY = 0; + std::shared_ptr CteOutputBytes; + std::shared_ptr CteOutputRows; + const NJson::TJsonValue* StatsNode = nullptr; + 
const ui32 StagePlanNodeId; +}; + +class TSource { + +public: + TSource(const TString& nodeType) : NodeType(nodeType) { + } + + TString NodeType; + std::shared_ptr IngressBytes; + std::shared_ptr IngressRows; + std::vector Info; +}; + +class TStage { + +public: + TStage(const TString& nodeType) : NodeType(nodeType) { + } + + TString NodeType; + std::shared_ptr Source; + std::shared_ptr IngressBytes; + std::vector> Connections; + ui32 IndentX = 0; + ui32 IndentY = 0; + ui32 OffsetY = 0; + ui32 Height = 0; + std::shared_ptr CpuTime; + std::shared_ptr MaxMemoryUsage; + std::shared_ptr OutputBytes; + std::shared_ptr OutputRows; + std::shared_ptr SpillingComputeTime; + std::shared_ptr SpillingComputeBytes; + std::shared_ptr SpillingChannelTime; + std::shared_ptr SpillingChannelBytes; + std::vector Info; + ui64 BaseTime = 0; + ui32 PlanNodeId = 0; + ui32 PhysicalStageId = 0; + ui32 Tasks = 0; + const NJson::TJsonValue* StatsNode = nullptr; +}; + +struct TColorPalette { + TColorPalette(); + TString StageMain; + TString StageClone; + TString StageText; + TString StageTextHighlight; + TString StageGrid; + TString IngressDark; + TString IngressMedium; + TString IngressLight; + TString InputDark; + TString InputMedium; + TString InputLight; + TString OutputDark; + TString OutputMedium; + TString OutputLight; + TString MemMedium; + TString MemLight; + TString CpuMedium; + TString CpuLight; + TString ConnectionFill; + TString ConnectionLine; + TString ConnectionText; + TString MinMaxLine; + TString TextLight; + TString TextInverted; + TString TextSummary; + TString SpillingBytesDark; + TString SpillingBytesMedium; + TString SpillingBytesLight; + TString SpillingTimeDark; + TString SpillingTimeMedium; + TString SpillingTimeLight; +}; + +struct TPlanViewConfig { + TPlanViewConfig(); + ui32 HeaderWidth; + ui32 SummaryWidth; + ui32 Width; + TColorPalette Palette; +}; + +class TPlan { + +public: + TPlan(const TString& nodeType, TPlanViewConfig& config, std::map>& cteStages, + 
std::map& cteSubPlans) + : NodeType(nodeType), Config(config), CteStages(cteStages), CteSubPlans(cteSubPlans) { + CpuTime = std::make_shared(); + MaxMemoryUsage = std::make_shared(); + OutputBytes = std::make_shared(); + OutputRows = std::make_shared(); + InputBytes = std::make_shared(); + InputRows = std::make_shared(); + IngressBytes = std::make_shared(); + IngressRows = std::make_shared(); + SpillingComputeTime = std::make_shared(); + SpillingComputeBytes = std::make_shared(); + SpillingChannelTime = std::make_shared(); + SpillingChannelBytes = std::make_shared(); + } + + void Load(const NJson::TJsonValue& node); + void LoadStage(std::shared_ptr stage, const NJson::TJsonValue& node, ui32 parentPlanNodeId); + void LoadSource(std::shared_ptr source, const NJson::TJsonValue& node); + void MarkStageIndent(ui32 indentX, ui32& offsetY, std::shared_ptr stage); + void MarkLayout(); + void ResolveCteRefs(); + void PrintTimeline(TStringBuilder& background, TStringBuilder& canvas, const TString& title, TAggregation& firstMessage, TAggregation& lastMessage, ui32 x, ui32 y, ui32 w, ui32 h, const TString& color); + void PrintWaitTime(TStringBuilder& canvas, std::shared_ptr metric, ui32 x, ui32 y, ui32 w, ui32 h, const TString& fillColor); + void PrintDeriv(TStringBuilder& canvas, TMetricHistory& history, ui32 x, ui32 y, ui32 w, ui32 h, const TString& title, const TString& lineColor, const TString& fillColor = ""); + void PrintValues(TStringBuilder& canvas, std::shared_ptr metric, ui32 x, ui32 y, ui32 w, ui32 h, const TString& title, const TString& lineColor, const TString& fillColor = ""); + void PrintStageSummary(TStringBuilder& background, TStringBuilder&, ui32 y0, std::shared_ptr metric, const TString& mediumColor, const TString& lightColor, const TString& textSum, const TString& tooltip); + void PrintSvg(ui64 maxTime, ui32& offsetY, TStringBuilder& background, TStringBuilder& canvas); + TString NodeType; + std::vector> Stages; + std::shared_ptr CpuTime; + std::shared_ptr 
MaxMemoryUsage; + std::shared_ptr OutputBytes; + std::shared_ptr OutputRows; + std::shared_ptr InputBytes; + std::shared_ptr InputRows; + std::shared_ptr IngressBytes; + std::shared_ptr IngressRows; + std::shared_ptr SpillingComputeTime; + std::shared_ptr SpillingComputeBytes; + std::shared_ptr SpillingChannelTime; + std::shared_ptr SpillingChannelBytes; + std::vector TotalCpuTimes; + std::vector TotalCpuValues; + TMetricHistory TotalCpuTime; + ui64 MaxTime = 1000; + ui64 BaseTime = 0; + ui64 TimeOffset = 0; + ui32 OffsetY = 0; + ui32 Tasks = 0; + std::vector>> CteRefs; + std::vector, ui32>>> MemberRefs; + TPlanViewConfig& Config; + std::map>& CteStages; + std::map& CteSubPlans; +}; + +class TPlanVisualizer { + +public: + + void LoadPlans(const TString& plans); + void LoadPlan(const TString& planNodeType, const NJson::TJsonValue& root); + void PostProcessPlans(); + TString PrintSvg(); + TString PrintSvgSafe(); + + std::vector Plans; + ui64 MaxTime = 1000; + ui64 BaseTime = 0; + TPlanViewConfig Config; + std::map> CteStages; + std::map CteSubPlans; +}; diff --git a/ydb/public/lib/ydb_cli/common/ya.make b/ydb/public/lib/ydb_cli/common/ya.make index 533d768660f6..16c9c4835ee4 100644 --- a/ydb/public/lib/ydb_cli/common/ya.make +++ b/ydb/public/lib/ydb_cli/common/ya.make @@ -13,6 +13,7 @@ SRCS( parameter_stream.cpp parameters.cpp pg_dump_parser.cpp + plan2svg.cpp pretty_table.cpp print_operation.cpp print_utils.cpp diff --git a/ydb/tests/fq/plans/test_stats_mode.py b/ydb/tests/fq/plans/test_stats_mode.py index f8a7a62d3d85..7679b3ac6133 100644 --- a/ydb/tests/fq/plans/test_stats_mode.py +++ b/ydb/tests/fq/plans/test_stats_mode.py @@ -14,7 +14,7 @@ class TestStatsMode: @pytest.mark.parametrize( "stats_mode", ["STATS_MODE_NONE", "STATS_MODE_BASIC", "STATS_MODE_FULL", "STATS_MODE_PROFILE"] ) - def test_mode(self, kikimr, s3, client, stats_mode): + def test_mode(self, kikimr, s3, client, stats_mode, yq_version): resource = boto3.resource( "s3", endpoint_url=s3.s3_url, 
aws_access_key_id="key", aws_secret_access_key="secret_key" ) @@ -50,6 +50,10 @@ def test_mode(self, kikimr, s3, client, stats_mode): query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + if yq_version == "v2": + result = client.describe_query(query_id).result + assert " list[str]: return [cli] class QueuePlan: - def __init__(self, plan: dict | None = None, table: str | None = None, ast: str | None = None) -> None: + def __init__(self, plan: dict | None = None, table: str | None = None, ast: str | None = None, svg: str | None = None) -> None: self.plan = plan self.table = table self.ast = ast + self.svg = svg class WorkloadRunResult: def __init__( @@ -92,6 +93,9 @@ def workload_run(type: WorkloadType, path: str, query_num: int, iterations: int if (os.path.exists(plan_path + '.ast')): with open(plan_path + '.ast') as f: plan.ast = f.read() + if (os.path.exists(plan_path + '.svg')): + with open(plan_path + '.svg') as f: + plan.svg = f.read() return YdbCliHelper.WorkloadRunResult( stats=stats, diff --git a/ydb/tests/olap/load/conftest.py b/ydb/tests/olap/load/conftest.py index f119887bd365..e2139f65638f 100644 --- a/ydb/tests/olap/load/conftest.py +++ b/ydb/tests/olap/load/conftest.py @@ -46,6 +46,8 @@ def _get_duraton(stats, field): allure.attach(result.plan.table, 'Plan table', attachment_type=allure.attachment_type.TEXT) if result.plan.ast is not None: allure.attach(result.plan.ast, 'Plan ast', attachment_type=allure.attachment_type.TEXT) + if result.plan.svg is not None: + allure.attach(result.plan.svg, 'Plan svg', attachment_type=allure.attachment_type.SVG) if result.stdout is not None: allure.attach(result.stdout, 'Stdout', attachment_type=allure.attachment_type.TEXT) diff --git a/ydb/tests/tools/fq_runner/kikimr_utils.py b/ydb/tests/tools/fq_runner/kikimr_utils.py index b3f86b84a3d8..9deb83e14262 100644 --- a/ydb/tests/tools/fq_runner/kikimr_utils.py 
+++ b/ydb/tests/tools/fq_runner/kikimr_utils.py @@ -114,6 +114,7 @@ def apply_to_kikimr(self, request, kikimr): solomon_endpoint = os.environ.get('SOLOMON_URL') if solomon_endpoint is not None: kikimr.compute_plane.fq_config['common']['monitoring_endpoint'] = solomon_endpoint + kikimr.control_plane.fq_config['common']['show_query_timeline'] = True class YQv2Extension(ExtensionPoint): diff --git a/ydb/tests/tools/kqprun/src/kqp_runner.cpp b/ydb/tests/tools/kqprun/src/kqp_runner.cpp index 399025dcee25..fdcaefc4901f 100644 --- a/ydb/tests/tools/kqprun/src/kqp_runner.cpp +++ b/ydb/tests/tools/kqprun/src/kqp_runner.cpp @@ -322,7 +322,7 @@ class TKqpRunner::TImpl { try { double cpuUsage = 0.0; - auto fullStat = StatProcessor_->GetQueryStat(convertedPlan, cpuUsage); + auto fullStat = StatProcessor_->GetQueryStat(convertedPlan, cpuUsage, nullptr, 0); auto flatStat = StatProcessor_->GetFlatStat(convertedPlan); auto publicStat = StatProcessor_->GetPublicStat(fullStat); From 1ec7315a105ed0e54b558265b0dc05c8f6fe8bc0 Mon Sep 17 00:00:00 2001 From: Ivan Sukhov Date: Tue, 24 Sep 2024 12:57:11 +0300 Subject: [PATCH 32/56] Fixed queries returning unspecified status (#9371) (#9639) --- ydb/library/yql/dq/actors/dq.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ydb/library/yql/dq/actors/dq.cpp b/ydb/library/yql/dq/actors/dq.cpp index bda08348dd47..3aa472cc7030 100644 --- a/ydb/library/yql/dq/actors/dq.cpp +++ b/ydb/library/yql/dq/actors/dq.cpp @@ -30,6 +30,8 @@ Ydb::StatusIds::StatusCode DqStatusToYdbStatus(NYql::NDqProto::StatusIds::Status return Ydb::StatusIds::EXTERNAL_ERROR; case NYql::NDqProto::StatusIds::SCHEME_ERROR: return Ydb::StatusIds::SCHEME_ERROR; + case NYql::NDqProto::StatusIds::UNSUPPORTED: + return Ydb::StatusIds::UNSUPPORTED; case NYql::NDqProto::StatusIds::GENERIC_ERROR: default: return Ydb::StatusIds::GENERIC_ERROR; @@ -63,12 +65,13 @@ NYql::NDqProto::StatusIds::StatusCode YdbStatusToDqStatus(Ydb::StatusIds::Status return 
NYql::NDqProto::StatusIds::CANCELLED; case Ydb::StatusIds::SCHEME_ERROR: return NYql::NDqProto::StatusIds::SCHEME_ERROR; - case Ydb::StatusIds::GENERIC_ERROR: - return NYql::NDqProto::StatusIds::GENERIC_ERROR; + case Ydb::StatusIds::UNSUPPORTED: + return NYql::NDqProto::StatusIds::UNSUPPORTED; case Ydb::StatusIds::EXTERNAL_ERROR: return NYql::NDqProto::StatusIds::EXTERNAL_ERROR; + case Ydb::StatusIds::GENERIC_ERROR: default: - return NYql::NDqProto::StatusIds::UNSPECIFIED; + return NYql::NDqProto::StatusIds::GENERIC_ERROR; } } From 67e35be14fcac264224d99a7dd90e2cfe364bb60 Mon Sep 17 00:00:00 2001 From: yumkam Date: Wed, 25 Sep 2024 17:51:32 +0300 Subject: [PATCH 33/56] Add checkpoint support for streamlookup (backport #9299) (#9719) --- .../dq/actors/compute/dq_compute_actor_impl.h | 21 ++++++++++++------- .../dq_input_transform_lookup.cpp | 2 +- .../task_runner/task_runner_actor_local.cpp | 14 +++++++++++++ ydb/library/yql/dq/runtime/dq_async_input.cpp | 7 ++++++- ydb/library/yql/dq/runtime/dq_async_input.h | 2 ++ 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h b/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h index e87acb1f7b7f..bcc9cc89784b 100644 --- a/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h +++ b/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h @@ -1369,8 +1369,20 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped } void PollAsyncInput() { + if (!Running) { + CA_LOG_T("Skip polling inputs and sources because not running"); + return; + } + + CA_LOG_T("Poll inputs"); + for (auto& [inputIndex, transform] : InputTransformsMap) { + if (auto resume = transform.PollAsyncInput(MetricsReporter, WatermarksTracker, RuntimeSettings.AsyncInputPushLimit)) { + ContinueExecute(*resume); + } + } + // Don't produce any input from sources if we're about to save checkpoint. 
- if (!Running || (Checkpoints && Checkpoints->HasPendingCheckpoint() && !Checkpoints->ComputeActorStateSaved())) { + if ((Checkpoints && Checkpoints->HasPendingCheckpoint() && !Checkpoints->ComputeActorStateSaved())) { CA_LOG_T("Skip polling sources because of pending checkpoint"); return; } @@ -1381,13 +1393,6 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped ContinueExecute(*resume); } } - - CA_LOG_T("Poll inputs"); - for (auto& [inputIndex, transform] : InputTransformsMap) { - if (auto resume = transform.PollAsyncInput(MetricsReporter, WatermarksTracker, RuntimeSettings.AsyncInputPushLimit)) { - ContinueExecute(*resume); - } - } } void OnNewAsyncInputDataArrived(const IDqComputeActorAsyncInput::TEvNewAsyncInputDataArrived::TPtr& ev) { diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 77b53fff73fa..23f9cd0b702e 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -184,7 +184,7 @@ class TInputTransformStreamLookupBase } } finished = IsFinished(); - return 0; + return AwaitingQueue.RowCount(); } TMaybe ExtraData() override { diff --git a/ydb/library/yql/dq/actors/task_runner/task_runner_actor_local.cpp b/ydb/library/yql/dq/actors/task_runner/task_runner_actor_local.cpp index 33cb50221d17..27cef00055d1 100644 --- a/ydb/library/yql/dq/actors/task_runner/task_runner_actor_local.cpp +++ b/ydb/library/yql/dq/actors/task_runner/task_runner_actor_local.cpp @@ -143,6 +143,18 @@ class TLocalTaskRunnerActor return false; } } + for (const auto transformId: InputTransforms) { + const auto t = TaskRunner->GetInputTransform(transformId); + if (t) { + auto [_, transform] = *t; + if (!transform->Empty()) { + return false; + } + if (transform->IsPending()) { + return false; + } + } + } return true; } @@ -436,6 +448,7 @@ class 
TLocalTaskRunnerActor for (auto i = 0; i != inputs.size(); ++i) { if (auto t = TaskRunner->GetInputTransform(i)) { inputTransforms[i] = *t; + InputTransforms.emplace(i); } } @@ -488,6 +501,7 @@ class TLocalTaskRunnerActor const TTxId TxId; const ui64 TaskId; THashSet Inputs; + THashSet InputTransforms; THashSet Sources; TIntrusivePtr TaskRunner; THashSet InputChannelsWithDisabledCheckpoints; diff --git a/ydb/library/yql/dq/runtime/dq_async_input.cpp b/ydb/library/yql/dq/runtime/dq_async_input.cpp index 7d515e5cb31e..9f5c1704813c 100644 --- a/ydb/library/yql/dq/runtime/dq_async_input.cpp +++ b/ydb/library/yql/dq/runtime/dq_async_input.cpp @@ -6,6 +6,7 @@ namespace NYql::NDq { class TDqAsyncInputBuffer : public TDqInputImpl { using TBaseImpl = TDqInputImpl; friend TBaseImpl; + bool Pending = false; public: TDqAsyncInputBufferStats PushStats; TDqInputStats PopStats; @@ -32,7 +33,7 @@ class TDqAsyncInputBuffer : public TDqInputImpl Date: Fri, 27 Sep 2024 09:49:03 +0300 Subject: [PATCH 34/56] Add streamlookup lru cache backport (#9763) --- .../dq_input_transform_lookup.cpp | 263 +++++++++++------- ydb/library/yql/dq/proto/dq_tasks.proto | 3 +- .../mkql_key_payload_value_lru_cache.h | 17 +- .../dq/planner/execution_planner.cpp | 2 + ydb/tests/fq/generic/test_streaming_join.py | 89 +++++- ydb/tests/fq/generic/ya.make | 2 + ydb/tests/fq/generic/ydb/01_basic.sh | 6 + 7 files changed, 277 insertions(+), 105 deletions(-) diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 23f9cd0b702e..98aabb3a9314 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -1,14 +1,18 @@ #include "dq_input_transform_lookup.h" + +#include #include #include #include -#include -#include #include #include -#include +#include + +#include #include +#include + namespace NYql::NDq 
{ namespace { @@ -31,13 +35,15 @@ class TInputTransformStreamLookupBase NActors::TActorId computeActorId, IDqAsyncIoFactory* factory, NDqProto::TDqInputTransformLookupSettings&& settings, - TVector&& inputJoinColumns, - TVector&& lookupJoinColumns, + TVector&& lookupInputIndexes, + TVector&& otherInputIndexes, const NMiniKQL::TMultiType* inputRowType, const NMiniKQL::TStructType* lookupKeyType, const NMiniKQL::TStructType* lookupPayloadType, const NMiniKQL::TMultiType* outputRowType, - const TOutputRowColumnOrder& outputRowColumnOrder + TOutputRowColumnOrder&& outputRowColumnOrder, + size_t cacheLimit, + std::chrono::seconds cacheTtl ) : Alloc(alloc) , HolderFactory(holderFactory) @@ -47,20 +53,28 @@ class TInputTransformStreamLookupBase , ComputeActorId(std::move(computeActorId)) , Factory(factory) , Settings(std::move(settings)) - , InputJoinColumns(std::move(inputJoinColumns)) - , LookupJoinColumns(std::move(lookupJoinColumns)) + , LookupInputIndexes(std::move(lookupInputIndexes)) + , OtherInputIndexes(std::move(otherInputIndexes)) , InputRowType(inputRowType) , LookupKeyType(lookupKeyType) , KeyTypeHelper(std::make_shared(lookupKeyType)) , LookupPayloadType(lookupPayloadType) , OutputRowType(outputRowType) - , OutputRowColumnOrder(outputRowColumnOrder) + , OutputRowColumnOrder(std::move(outputRowColumnOrder)) , InputFlowFetchStatus(NUdf::EFetchStatus::Yield) - , AwaitingQueue(InputRowType) + , LruCache(std::make_unique(cacheLimit, lookupKeyType)) + , CacheTtl(cacheTtl) , ReadyQueue(OutputRowType) , WaitingForLookupResults(false) { Y_ABORT_UNLESS(Alloc); + for (size_t i = 0; i != LookupInputIndexes.size(); ++i) { + Y_DEBUG_ABORT_UNLESS(LookupInputIndexes[i] < InputRowType->GetElementsCount()); + } + for (size_t i = 0; i != OtherInputIndexes.size(); ++i) { + Y_DEBUG_ABORT_UNLESS(OtherInputIndexes[i] < InputRowType->GetElementsCount()); + } + Y_DEBUG_ABORT_UNLESS(LookupInputIndexes.size() == LookupKeyType->GetMembersCount()); } void Bootstrap() { @@ -89,26 +103,17 
@@ class TInputTransformStreamLookupBase hFunc(IDqAsyncLookupSource::TEvLookupResult, Handle); ) - void Handle(IDqAsyncLookupSource::TEvLookupResult::TPtr ev) { - auto guard = BindAllocator(); - const auto lookupResult = std::move(ev->Get()->Result); - while (!AwaitingQueue.empty()) { - const auto wideInputRow = AwaitingQueue.Head(); - NUdf::TUnboxedValue* keyItems; - NUdf::TUnboxedValue lookupKey = HolderFactory.CreateDirectArrayHolder(InputJoinColumns.size(), keyItems); - for (size_t i = 0; i != InputJoinColumns.size(); ++i) { - keyItems[i] = wideInputRow[InputJoinColumns[i]]; - } - auto lookupPayload = lookupResult.FindPtr(lookupKey); - + void AddReadyQueue(NUdf::TUnboxedValue& lookupKey, NUdf::TUnboxedValue& inputOther, NUdf::TUnboxedValue *lookupPayload) { NUdf::TUnboxedValue* outputRowItems; NUdf::TUnboxedValue outputRow = HolderFactory.CreateDirectArrayHolder(OutputRowColumnOrder.size(), outputRowItems); for (size_t i = 0; i != OutputRowColumnOrder.size(); ++i) { const auto& [source, index] = OutputRowColumnOrder[i]; - switch(source) { + switch (source) { case EOutputRowItemSource::InputKey: + outputRowItems[i] = lookupKey.GetElement(index); + break; case EOutputRowItemSource::InputOther: - outputRowItems[i] = wideInputRow[index]; + outputRowItems[i] = inputOther.GetElement(index); break; case EOutputRowItemSource::LookupKey: outputRowItems[i] = lookupPayload && *lookupPayload ? 
lookupKey.GetElement(index) : NUdf::TUnboxedValue {}; @@ -123,8 +128,23 @@ class TInputTransformStreamLookupBase break; } } - AwaitingQueue.Pop(); ReadyQueue.PushRow(outputRowItems, OutputRowType->GetElementsCount()); + } + + void Handle(IDqAsyncLookupSource::TEvLookupResult::TPtr ev) { + auto guard = BindAllocator(); + const auto now = std::chrono::steady_clock::now(); + auto lookupResult = std::move(ev->Get()->Result); + for (; !AwaitingQueue.empty(); AwaitingQueue.pop_front()) { + auto& [lookupKey, inputOther] = AwaitingQueue.front(); + auto lookupPayload = lookupResult.FindPtr(lookupKey); + if (lookupPayload == nullptr) { + continue; + } + AddReadyQueue(lookupKey, inputOther, lookupPayload); + } + for (auto&& [k, v]: lookupResult) { + LruCache->Update(NUdf::TUnboxedValue(const_cast(k)), std::move(v), now + CacheTtl); } WaitingForLookupResults = false; Send(ComputeActorId, new TEvNewAsyncInputDataArrived{InputIndex}); @@ -150,41 +170,59 @@ class TInputTransformStreamLookupBase //All resources, held by this class, that have been created with mkql allocator, must be deallocated here InputFlow.Clear(); KeyTypeHelper.reset(); - NMiniKQL::TUnboxedValueBatch{}.swap(AwaitingQueue); - NMiniKQL::TUnboxedValueBatch{}.swap(ReadyQueue); + decltype(AwaitingQueue){}.swap(AwaitingQueue); + decltype(ReadyQueue){}.swap(ReadyQueue); + LruCache.reset(); } - i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& batch, TMaybe&, bool& finished, i64 freeSpace) final { - Y_UNUSED(freeSpace); - auto guard = BindAllocator(); + void DrainReadyQueue(NKikimr::NMiniKQL::TUnboxedValueBatch& batch) { while (!ReadyQueue.empty()) { PushOutputValue(batch, ReadyQueue.Head()); ReadyQueue.Pop(); } + } + + i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& batch, TMaybe&, bool& finished, i64 freeSpace) final { + Y_UNUSED(freeSpace); + auto guard = BindAllocator(); + + DrainReadyQueue(batch); if (InputFlowFetchStatus != NUdf::EFetchStatus::Finish && !WaitingForLookupResults) { - 
NUdf::TUnboxedValue* inputRowItems; - NUdf::TUnboxedValue inputRow = HolderFactory.CreateDirectArrayHolder(InputRowType->GetElementsCount(), inputRowItems); + NUdf::TUnboxedValue* inputRowItems; + NUdf::TUnboxedValue inputRow = HolderFactory.CreateDirectArrayHolder(InputRowType->GetElementsCount(), inputRowItems); + const auto now = std::chrono::steady_clock::now(); const auto maxKeysInRequest = LookupSource.first->GetMaxSupportedKeysInRequest(); IDqAsyncLookupSource::TUnboxedValueMap keysForLookup{maxKeysInRequest, KeyTypeHelper->GetValueHash(), KeyTypeHelper->GetValueEqual()}; + LruCache->Prune(now); while ( (keysForLookup.size() < maxKeysInRequest) && ((InputFlowFetchStatus = FetchWideInputValue(inputRowItems)) == NUdf::EFetchStatus::Ok)) { NUdf::TUnboxedValue* keyItems; - NUdf::TUnboxedValue key = HolderFactory.CreateDirectArrayHolder(InputJoinColumns.size(), keyItems); - for (size_t i = 0; i != InputJoinColumns.size(); ++i) { - keyItems[i] = inputRowItems[InputJoinColumns[i]]; - } - keysForLookup.emplace(std::move(key), NUdf::TUnboxedValue{}); - AwaitingQueue.PushRow(inputRowItems, InputRowType->GetElementsCount()); + NUdf::TUnboxedValue key = HolderFactory.CreateDirectArrayHolder(LookupInputIndexes.size(), keyItems); + NUdf::TUnboxedValue* otherItems; + NUdf::TUnboxedValue other = HolderFactory.CreateDirectArrayHolder(OtherInputIndexes.size(), otherItems); + for (size_t i = 0; i != LookupInputIndexes.size(); ++i) { + keyItems[i] = inputRowItems[LookupInputIndexes[i]]; + } + for (size_t i = 0; i != OtherInputIndexes.size(); ++i) { + otherItems[i] = inputRowItems[OtherInputIndexes[i]]; + } + if (auto lookupPayload = LruCache->Get(key, now)) { + AddReadyQueue(key, other, &*lookupPayload); + } else { + AwaitingQueue.emplace_back(key, std::move(other)); + keysForLookup.emplace(std::move(key), NUdf::TUnboxedValue{}); + } } if (!keysForLookup.empty()) { LookupSource.first->AsyncLookup(std::move(keysForLookup)); WaitingForLookupResults = true; } + 
DrainReadyQueue(batch); } finished = IsFinished(); - return AwaitingQueue.RowCount(); + return AwaitingQueue.size(); } TMaybe ExtraData() override { @@ -217,13 +255,13 @@ class TInputTransformStreamLookupBase const NMiniKQL::TTypeEnvironment& TypeEnv; ui64 InputIndex; // NYql::NDq::IDqComputeActorAsyncInput NUdf::TUnboxedValue InputFlow; - const NActors::TActorId ComputeActorId; + const NActors::TActorId ComputeActorId; IDqAsyncIoFactory::TPtr Factory; NDqProto::TDqInputTransformLookupSettings Settings; protected: std::pair LookupSource; - const TVector InputJoinColumns; - const TVector LookupJoinColumns; + const TVector LookupInputIndexes; + const TVector OtherInputIndexes; const NMiniKQL::TMultiType* const InputRowType; const NMiniKQL::TStructType* const LookupKeyType; //key column types in LookupTable std::shared_ptr KeyTypeHelper; @@ -232,7 +270,11 @@ class TInputTransformStreamLookupBase const TOutputRowColumnOrder OutputRowColumnOrder; NUdf::EFetchStatus InputFlowFetchStatus; - NKikimr::NMiniKQL::TUnboxedValueBatch AwaitingQueue; + std::unique_ptr LruCache; + std::chrono::seconds CacheTtl; + using TInputKeyOtherPair = std::pair; + using TAwaitingQueue = std::deque>; //input row split in two parts: key columns and other columns + TAwaitingQueue AwaitingQueue; NKikimr::NMiniKQL::TUnboxedValueBatch ReadyQueue; std::atomic WaitingForLookupResults; NYql::NDq::TDqAsyncStats IngressStats; @@ -278,10 +320,10 @@ class TInputTransformStreamLookupNarrow: public TInputTransformStreamLookupBase std::pair< - const NMiniKQL::TStructType*, //lookup key, may contain several columns + const NMiniKQL::TStructType*, //lookup key, may contain several columns const NMiniKQL::TStructType* //lookup result(payload) the rest columns > SplitLookupTableColumns( - const NMiniKQL::TStructType* rowType, + const NMiniKQL::TStructType* rowType, const THashMap& keyColumns, const NMiniKQL::TTypeEnvironment& typeEnv ) { @@ -339,80 +381,94 @@ std::tuple Deser } } -TOutputRowColumnOrder 
CategorizeOutputRowItems( +std::pair< + TOutputRowColumnOrder, + TVector +> CategorizeOutputRowItems( const NMiniKQL::TStructType* type, TStringBuf leftLabel, TStringBuf rightLabel, - const THashSet& leftJoinColumns, - const THashSet& rightJoinColumns) + const auto& rightNames, + const THashMap& leftJoinColumns, + const THashMap& lookupKeyColumns, + const THashMap& lookupPayloadColumns, + const THashMap& inputColumns +) { TOutputRowColumnOrder result(type->GetMembersCount()); - size_t idxLeft = 0; - size_t idxRightKey = 0; - size_t idxRightPayload = 0; + TVector otherInputIndexes; for (ui32 i = 0; i != type->GetMembersCount(); ++i) { const auto prefixedName = type->GetMemberName(i); if (prefixedName.starts_with(leftLabel) && prefixedName.length() > leftLabel.length() && prefixedName[leftLabel.length()] == '.') { const auto name = prefixedName.SubStr(leftLabel.length() + 1); //skip prefix and dot - result[i] = { - leftJoinColumns.contains(name) ? EOutputRowItemSource::InputKey : EOutputRowItemSource::InputOther, - idxLeft++ - }; + if (auto j = leftJoinColumns.FindPtr(name)) { + result[i] = { EOutputRowItemSource::InputKey, lookupKeyColumns.at(rightNames[*j]) }; + } else { + result[i] = { EOutputRowItemSource::InputOther, otherInputIndexes.size() }; + otherInputIndexes.push_back(inputColumns.at(name)); + } } else if (prefixedName.starts_with(rightLabel) && prefixedName.length() > rightLabel.length() && prefixedName[rightLabel.length()] == '.') { const auto name = prefixedName.SubStr(rightLabel.length() + 1); //skip prefix and dot - //presume that indexes in LookupKey, LookupOther has the same relative position as in OutputRow - if (rightJoinColumns.contains(name)) { - result[i] = {EOutputRowItemSource::LookupKey, idxRightKey++}; + if (auto j = lookupKeyColumns.FindPtr(name)) { + result[i] = { EOutputRowItemSource::LookupKey, *j }; } else { - result[i] = {EOutputRowItemSource::LookupOther, idxRightPayload++}; + result[i] = { EOutputRowItemSource::LookupOther, 
lookupPayloadColumns.at(name) }; } } else { Y_ABORT(); } } - return result; + return { std::move(result), std::move(otherInputIndexes) }; } -THashMap GetNameToIndex(const ::google::protobuf::RepeatedPtrField& names) { +template +THashMap GetNameToIndex(TIndex size, TGetter&& getter) { THashMap result; - for (int i = 0; i != names.size(); ++i) { - result[names[i]] = i; + for (TIndex i = 0; i != size; ++i) { + result[getter(i)] = i; } return result; } +THashMap GetNameToIndex(const ::google::protobuf::RepeatedPtrField& names) { + return GetNameToIndex(names.size(), [&names](auto idx) { + return names[idx]; + }); +} + THashMap GetNameToIndex(const NMiniKQL::TStructType* type) { - THashMap result; - for (ui32 i = 0; i != type->GetMembersCount(); ++i) { - result[type->GetMemberName(i)] = i; - } - return result; + return GetNameToIndex(type->GetMembersCount(), [type](auto idx) { + return type->GetMemberName(idx); + }); } -TVector GetJoinColumnIndexes(const ::google::protobuf::RepeatedPtrField& names, const THashMap& joinColumns) { +template +TVector GetJoinColumnIndexes(TIndex size, TGetter&& getter, const THashMap& joinColumns) { TVector result; - result.reserve(joinColumns.size()); - for (int i = 0; i != names.size(); ++i) { - if (auto p = joinColumns.FindPtr(names[i])) { + result.reserve(size); + for (TIndex i = 0; i != size; ++i) { + if (auto p = joinColumns.FindPtr(getter(i))) { result.push_back(*p); } } return result; } +[[maybe_unused]] +TVector GetJoinColumnIndexes(const ::google::protobuf::RepeatedPtrField& names, const THashMap& joinColumns) { + return GetJoinColumnIndexes(names.size(), [&names](auto idx) { + return names[idx]; + }, joinColumns); +} + TVector GetJoinColumnIndexes(const NMiniKQL::TStructType* type, const THashMap& joinColumns) { - TVector result; - result.reserve(joinColumns.size()); - for (ui32 i = 0; i != type->GetMembersCount(); ++i) { - if (auto p = joinColumns.FindPtr(type->GetMemberName(i))) { - result.push_back(*p); - } - } - return 
result; + return GetJoinColumnIndexes(type->GetMembersCount(), [type](auto idx) { + return type->GetMemberName(idx); + }, joinColumns); } } // namespace @@ -420,7 +476,7 @@ TVector GetJoinColumnIndexes(const NMiniKQL::TStructType* type, const TH std::pair CreateInputTransformStreamLookup( IDqAsyncIoFactory* factory, NDqProto::TDqInputTransformLookupSettings&& settings, - IDqAsyncIoFactory::TInputTransformArguments&& args //TODO expand me + IDqAsyncIoFactory::TInputTransformArguments&& args ) { const auto narrowInputRowType = DeserializeStructType(settings.GetNarrowInputRowType(), args.TypeEnv); @@ -432,22 +488,33 @@ std::pair CreateInputTransformStre const auto rightRowType = DeserializeStructType(settings.GetRightSource().GetSerializedRowType(), args.TypeEnv); auto inputColumns = GetNameToIndex(narrowInputRowType); + auto leftJoinColumns = GetNameToIndex(settings.GetLeftJoinKeyNames()); auto rightJoinColumns = GetNameToIndex(settings.GetRightJoinKeyNames()); - auto leftJoinColumnIndexes = GetJoinColumnIndexes( - settings.GetLeftJoinKeyNames(), - inputColumns); auto rightJoinColumnIndexes = GetJoinColumnIndexes(rightRowType, rightJoinColumns); Y_ABORT_UNLESS(rightJoinColumnIndexes.size() == rightJoinColumns.size()); - Y_ABORT_UNLESS(leftJoinColumnIndexes.size() == rightJoinColumnIndexes.size()); - - const auto& [lookupKeyType, lookupPayloadType] = SplitLookupTableColumns(rightRowType, rightJoinColumns, args.TypeEnv); - const auto& outputColumnsOrder = CategorizeOutputRowItems( + + auto&& [lookupKeyType, lookupPayloadType] = SplitLookupTableColumns(rightRowType, rightJoinColumns, args.TypeEnv); + + auto lookupKeyColumns = GetNameToIndex(lookupKeyType); + auto lookupPayloadColumns = GetNameToIndex(lookupPayloadType); + + auto lookupKeyInputIndexes = GetJoinColumnIndexes( + lookupKeyType->GetMembersCount(), + [&leftJoinKeyNames = settings.GetLeftJoinKeyNames(), + &rightJoinColumns, &lookupKeyType = lookupKeyType](auto idx) { + return 
leftJoinKeyNames[rightJoinColumns.at(lookupKeyType->GetMemberName(idx))]; + }, inputColumns); + + auto&& [outputColumnsOrder, otherInputIndexes] = CategorizeOutputRowItems( narrowOutputRowType, settings.GetLeftLabel(), settings.GetRightLabel(), - {settings.GetLeftJoinKeyNames().cbegin(), settings.GetLeftJoinKeyNames().cend()}, - {settings.GetRightJoinKeyNames().cbegin(), settings.GetRightJoinKeyNames().cend()} + settings.GetRightJoinKeyNames(), + leftJoinColumns, + lookupKeyColumns, + lookupPayloadColumns, + inputColumns ); auto actor = isWide ? (TInputTransformStreamLookupBase*)new TInputTransformStreamLookupWide( @@ -459,13 +526,15 @@ std::pair CreateInputTransformStre args.ComputeActorId, factory, std::move(settings), - std::move(leftJoinColumnIndexes), - std::move(rightJoinColumnIndexes), + std::move(lookupKeyInputIndexes), + std::move(otherInputIndexes), inputRowType, lookupKeyType, lookupPayloadType, outputRowType, - outputColumnsOrder + std::move(outputColumnsOrder), + settings.GetCacheLimit(), + std::chrono::seconds(settings.GetCacheTtlSeconds()) ) : (TInputTransformStreamLookupBase*)new TInputTransformStreamLookupNarrow( args.Alloc, @@ -476,13 +545,15 @@ std::pair CreateInputTransformStre args.ComputeActorId, factory, std::move(settings), - std::move(leftJoinColumnIndexes), - std::move(rightJoinColumnIndexes), + std::move(lookupKeyInputIndexes), + std::move(otherInputIndexes), inputRowType, lookupKeyType, lookupPayloadType, outputRowType, - outputColumnsOrder + std::move(outputColumnsOrder), + settings.GetCacheLimit(), + std::chrono::seconds(settings.GetCacheTtlSeconds()) ); return {actor, actor}; } diff --git a/ydb/library/yql/dq/proto/dq_tasks.proto b/ydb/library/yql/dq/proto/dq_tasks.proto index 318cf95d72e6..da8bdf36122e 100644 --- a/ydb/library/yql/dq/proto/dq_tasks.proto +++ b/ydb/library/yql/dq/proto/dq_tasks.proto @@ -184,7 +184,8 @@ message TDqInputTransformLookupSettings { repeated string RightJoinKeyNames = 6; //Join column names in the right 
hand, in the same order as previous bytes NarrowInputRowType = 7; //Serialized struct type bytes NarrowOutputRowType = 8; //Serialized struct type - //TODO add lookup cache parameters + uint64 CacheLimit = 9; + uint64 CacheTtlSeconds = 10; } message TDqTask { diff --git a/ydb/library/yql/minikql/computation/mkql_key_payload_value_lru_cache.h b/ydb/library/yql/minikql/computation/mkql_key_payload_value_lru_cache.h index 854ad8b5a7fc..dc9b77e8ed91 100644 --- a/ydb/library/yql/minikql/computation/mkql_key_payload_value_lru_cache.h +++ b/ydb/library/yql/minikql/computation/mkql_key_payload_value_lru_cache.h @@ -19,7 +19,7 @@ namespace NKikimr::NMiniKQL { // Never requests system time, expects monotonically increased time points in methods argument class TUnboxedKeyValueLruCacheWithTtl { struct TEntry { - TEntry(NUdf::TUnboxedValue key, NUdf::TUnboxedValue value, std::chrono::time_point expiration) + TEntry(NUdf::TUnboxedValue key, NUdf::TUnboxedValue value, std::chrono::time_point expiration) : Key(std::move(key)) , Value(std::move(value)) , Expiration(std::move(expiration)) @@ -73,16 +73,23 @@ class TUnboxedKeyValueLruCacheWithTtl { return std::nullopt; } - // Perform garbage collection. + // Perform garbage collection, single step, O(1) time. 
// Must be called periodically - void Tick(const std::chrono::time_point& now) { + bool Tick(const std::chrono::time_point& now) { if (UsageList.empty()) { - return; + return false; } if (now < UsageList.front().Expiration) { - return; + return false; } RemoveLeastRecentlyUsedEntry(); + return true; + } + + // Perform garbage collection, O(1) amortized, but O(n) one-time + void Prune(const std::chrono::time_point& now) { + while (Tick(now)) { + } } size_t Size() const { diff --git a/ydb/library/yql/providers/dq/planner/execution_planner.cpp b/ydb/library/yql/providers/dq/planner/execution_planner.cpp index bfe95edc78e8..69507575a78b 100644 --- a/ydb/library/yql/providers/dq/planner/execution_planner.cpp +++ b/ydb/library/yql/providers/dq/planner/execution_planner.cpp @@ -604,6 +604,8 @@ namespace NYql::NDqs { const auto narrowOutputRowType = GetSeqItemType(streamLookup.Ptr()->GetTypeAnn()); Y_ABORT_UNLESS(narrowOutputRowType->GetKind() == ETypeAnnotationKind::Struct); settings.SetNarrowOutputRowType(NYql::NCommon::GetSerializedTypeAnnotation(narrowOutputRowType)); + settings.SetCacheLimit(1'000'000); //TODO configure me + settings.SetCacheTtlSeconds(60); //TODO configure me const auto inputRowType = GetSeqItemType(streamLookup.Output().Stage().Program().Ref().GetTypeAnn()); const auto outputRowType = GetSeqItemType(stage.Program().Args().Arg(inputIndex).Ref().GetTypeAnn()); diff --git a/ydb/tests/fq/generic/test_streaming_join.py b/ydb/tests/fq/generic/test_streaming_join.py index 16269e2129cf..e5018d23b1c6 100644 --- a/ydb/tests/fq/generic/test_streaming_join.py +++ b/ydb/tests/fq/generic/test_streaming_join.py @@ -15,14 +15,15 @@ DEBUG = 0 -def ResequenceId(messages): +def ResequenceId(messages, field="id"): res = [] i = 1 for pair in messages: rpair = [] for it in pair: src = json.loads(it) - src["id"] = i + if field in src: + src[field] = i rpair += [json.dumps(src)] res += [tuple(rpair)] i += 1 @@ -310,6 +311,88 @@ def freeze(json): ), ], ), + # 6 + ( + R''' + 
$input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + za Int32, + yb STRING, + yc Int32, + zd Int32, + ) + ) ; + + $enriched = select a, b, c, d, e, f, za, yb, yc, zd + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.db as u + on(e.yb = u.b AND e.za = u.a ) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + ResequenceId( + [ + ( + '{"id":1,"za":1,"yb":"2","yc":100,"zd":101}', + '{"a":1,"b":"2","c":3,"d":4,"e":5,"f":6,"za":1,"yb":"2","yc":100,"zd":101}', + ), + ( + '{"id":2,"za":7,"yb":"8","yc":106,"zd":107}', + '{"a":7,"b":"8","c":9,"d":10,"e":11,"f":12,"za":7,"yb":"8","yc":106,"zd":107}', + ), + ( + '{"id":3,"za":2,"yb":"1","yc":114,"zd":115}', + '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":2,"yb":"1","yc":114,"zd":115}', + ), + ] + ), + ), + # 7 + ( + R''' + $input = SELECT * FROM myyds.`{input_topic}` + WITH ( + FORMAT=json_each_row, + SCHEMA ( + za Int32, + yb STRING, + yc Int32, + zd Int32, + ) + ) ; + + $enriched = select a, b, c, d, e, f, za, yb, yc, zd + from + $input as e + left join {streamlookup} ydb_conn_{table_name}.db as u + on(e.za = u.a AND e.yb = u.b) + ; + + insert into myyds.`{output_topic}` + select Unwrap(Yson::SerializeJson(Yson::From(TableRow()))) from $enriched; + ''', + ResequenceId( + [ + ( + '{"id":1,"za":1,"yb":"2","yc":100,"zd":101}', + '{"a":1,"b":"2","c":3,"d":4,"e":5,"f":6,"za":1,"yb":"2","yc":100,"zd":101}', + ), + ( + '{"id":2,"za":7,"yb":"8","yc":106,"zd":107}', + '{"a":7,"b":"8","c":9,"d":10,"e":11,"f":12,"za":7,"yb":"8","yc":106,"zd":107}', + ), + ( + '{"id":3,"za":2,"yb":"1","yc":114,"zd":115}', + '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":2,"yb":"1","yc":114,"zd":115}', + ), + ] + ), + ), ] @@ -367,7 +450,7 @@ def test_simple(self, kikimr, fq_client: FederatedQueryClient, settings: Setting @yq_v1 @pytest.mark.parametrize("mvp_external_ydb_endpoint", 
[{"endpoint": "tests-fq-generic-ydb:2136"}], indirect=True) @pytest.mark.parametrize("fq_client", [{"folder_id": "my_folder_slj"}], indirect=True) - @pytest.mark.parametrize("partitions_count", [1, 3]) + @pytest.mark.parametrize("partitions_count", [1, 3] if DEBUG else [3]) @pytest.mark.parametrize("streamlookup", [False, True] if DEBUG else [True]) @pytest.mark.parametrize("testcase", [*range(len(TESTCASES))]) def test_streamlookup( diff --git a/ydb/tests/fq/generic/ya.make b/ydb/tests/fq/generic/ya.make index 06b3bdfc9644..0fbf32d0b231 100644 --- a/ydb/tests/fq/generic/ya.make +++ b/ydb/tests/fq/generic/ya.make @@ -71,4 +71,6 @@ TEST_SRCS( test_ydb.py ) +TIMEOUT(1800) + END() diff --git a/ydb/tests/fq/generic/ydb/01_basic.sh b/ydb/tests/fq/generic/ydb/01_basic.sh index 2a54e14f6b47..f999967044b7 100755 --- a/ydb/tests/fq/generic/ydb/01_basic.sh +++ b/ydb/tests/fq/generic/ydb/01_basic.sh @@ -35,6 +35,12 @@ set -ex (56, 12, "2a02:1812:1713:4f00:517e:1d79:c88b:704", "Elena", 2), (18, 17, "ivalid ip", "newUser", 12); COMMIT; + CREATE TABLE db (b STRING NOT NULL, c Int32, a Int32 NOT NULL, d Int32, f Int32, e Int32, PRIMARY KEY(b, a)); + COMMIT; + INSERT INTO db (a, b, c, d, e, f) VALUES + (1, "2", 3, 4, 5, 6), + (7, "8", 9, 10, 11, 12); + COMMIT; ' retVal=$? 
From 97e9a76d1ed3c385299860dcf59dae3faee43d43 Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Fri, 27 Sep 2024 18:51:54 +0200 Subject: [PATCH 35/56] merge from main: s3 listing strategy has been fixed (#9499) (#9821) Co-authored-by: Oleg Doronin --- .../s3/provider/yql_s3_listing_strategy.cpp | 20 ++--- ydb/tests/fq/s3/conftest.py | 6 +- ydb/tests/fq/s3/test_s3_1.py | 78 +++++++++++++++++++ ydb/tests/tools/fq_runner/kikimr_utils.py | 12 +++ 4 files changed, 104 insertions(+), 12 deletions(-) diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp index a7d52c408905..4a59bb345c1a 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_listing_strategy.cpp @@ -26,6 +26,11 @@ IOutputStream& operator<<(IOutputStream& stream, const TS3ListingOptions& option namespace { +TString ParseBasePath(const TString& path) { + TString basePath = TString{TStringBuf{path}.RBefore('/')}; + return basePath == path && !basePath.EndsWith('/') ? TString{} : basePath; +} + using namespace NThreading; using namespace NS3Lister; @@ -492,15 +497,10 @@ class TBFSDirectoryResolverIterator : public IS3Lister { return NextDirectoryListeningChunk; } - static TString ParseBasePath(const TString& path) { - TString basePath = TString{TStringBuf{path}.RBefore('/')}; - return basePath == path && !basePath.EndsWith('/') ? 
TString{} : basePath; - } - void PerformEarlyStop(TListEntries& result, const TString& sourcePrefix) { result.Directories.push_back({.Path = ParseBasePath(sourcePrefix)}); for (auto& directoryPrefix : DirectoryPrefixQueue) { - result.Directories.push_back({.Path = directoryPrefix}); + result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)}); } DirectoryPrefixQueue.clear(); } @@ -518,10 +518,10 @@ class TBFSDirectoryResolverIterator : public IS3Lister { } } else { for (auto& directoryPrefix : listingResult.Directories) { - result.Directories.push_back({.Path = directoryPrefix.Path}); + result.Directories.push_back({.Path = ParseBasePath(directoryPrefix.Path)}); } for (auto& directoryPrefix : DirectoryPrefixQueue) { - result.Directories.push_back({.Path = directoryPrefix}); + result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)}); } DirectoryPrefixQueue.clear(); } @@ -766,10 +766,10 @@ class TConcurrentBFSDirectoryResolverIterator : public IS3Lister { // TODO: add verification auto result = TListEntries{.Objects = Objects}; for (auto& directoryPrefix : DirectoryPrefixQueue) { - result.Directories.push_back({.Path = directoryPrefix}); + result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)}); } for (auto& directoryPrefix: InProgressPaths) { - result.Directories.push_back({.Path = directoryPrefix}); + result.Directories.push_back({.Path = ParseBasePath(directoryPrefix)}); } for (auto& directoryEntry : Directories) { result.Directories.push_back(directoryEntry); diff --git a/ydb/tests/fq/s3/conftest.py b/ydb/tests/fq/s3/conftest.py index 12eeea711880..5393ad70d666 100644 --- a/ydb/tests/fq/s3/conftest.py +++ b/ydb/tests/fq/s3/conftest.py @@ -8,9 +8,10 @@ from ydb.tests.tools.fq_runner.fq_client import FederatedQueryClient from ydb.tests.tools.fq_runner.custom_hooks import * # noqa: F401,F403 Adding custom hooks for YQv2 support -from ydb.tests.tools.fq_runner.kikimr_utils import AddInflightExtension +from 
ydb.tests.tools.fq_runner.kikimr_utils import AddAllowConcurrentListingsExtension from ydb.tests.tools.fq_runner.kikimr_utils import AddDataInflightExtension from ydb.tests.tools.fq_runner.kikimr_utils import AddFormatSizeLimitExtension +from ydb.tests.tools.fq_runner.kikimr_utils import AddInflightExtension from ydb.tests.tools.fq_runner.kikimr_utils import DefaultConfigExtension from ydb.tests.tools.fq_runner.kikimr_utils import YQv2Extension from ydb.tests.tools.fq_runner.kikimr_utils import ComputeExtension @@ -87,9 +88,10 @@ def kikimr_params(request: pytest.FixtureRequest): def get_kikimr_extensions(s3: S3, yq_version: str, kikimr_settings, mvp_external_ydb_endpoint): return [ + AddFormatSizeLimitExtension(), AddInflightExtension(), + AddAllowConcurrentListingsExtension(), AddDataInflightExtension(), - AddFormatSizeLimitExtension(), DefaultConfigExtension(s3.s3_url), YQv2Extension(yq_version, kikimr_settings.get("is_replace_if_exists", False)), ComputeExtension(), diff --git a/ydb/tests/fq/s3/test_s3_1.py b/ydb/tests/fq/s3/test_s3_1.py index 0f4260bdf4cc..fc7c6868af0e 100644 --- a/ydb/tests/fq/s3/test_s3_1.py +++ b/ydb/tests/fq/s3/test_s3_1.py @@ -557,3 +557,81 @@ def test_top_level_listing(self, kikimr, s3, client, runtime_listing, unique_pre assert result_set.rows[5].items[1].int32_value == 15 assert result_set.rows[5].items[2].int32_value == 33 assert sum(kikimr.control_plane.get_metering(1)) == 10 + + @yq_all + @pytest.mark.parametrize("client", [{"folder_id": "my_folder"}], indirect=True) + @pytest.mark.parametrize("runtime_listing", ["false", "true"]) + @pytest.mark.parametrize("kikimr_params", [{"allow_concurrent_listings": True}], indirect=True) + def test_top_level_listing_2(self, kikimr, s3, client, runtime_listing, unique_prefix): + resource = boto3.resource( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + bucket = resource.Bucket("fbucket") + bucket.create(ACL='public-read') + 
bucket.objects.all().delete() + + s3_client = boto3.client( + "s3", endpoint_url=s3.s3_url, aws_access_key_id="key", aws_secret_access_key="secret_key" + ) + + fruits = '''Fruit,Price,Weight +Banana,3,100 +Apple,2,22 +Pear,15,33''' + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='2024-08-09.csv', ContentType='text/plain') + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='2024-09-08.csv', ContentType='text/plain') + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='2024-08-08.csv', ContentType='text/plain') + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='/a/2024-08-08.csv', ContentType='text/plain') + s3_client.put_object(Body=fruits, Bucket='fbucket', Key='/b/test.csv', ContentType='text/plain') + + kikimr.control_plane.wait_bootstrap(1) + storage_connection_name = unique_prefix + "test_top_level_listing_2" + client.create_storage_connection(storage_connection_name, "fbucket") + + sql = f''' + pragma s3.UseRuntimeListing="{runtime_listing}"; + + SELECT * + FROM `{storage_connection_name}`.`/2024-08-*` + WITH (format=csv_with_names, SCHEMA ( + Fruit String NOT NULL, + Price Int NOT NULL, + Weight Int NOT NULL + ) + ); + ''' + + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.ANALYTICS).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.COMPLETED) + + data = client.get_result_data(query_id) + result_set = data.result.result_set + logging.debug(str(result_set)) + assert len(result_set.columns) == 3 + assert result_set.columns[0].name == "Fruit" + assert result_set.columns[0].type.type_id == ydb.Type.STRING + assert result_set.columns[1].name == "Price" + assert result_set.columns[1].type.type_id == ydb.Type.INT32 + assert result_set.columns[2].name == "Weight" + assert result_set.columns[2].type.type_id == ydb.Type.INT32 + assert len(result_set.rows) == 6 + assert result_set.rows[0].items[0].bytes_value == b"Banana" + assert result_set.rows[0].items[1].int32_value == 3 + assert 
result_set.rows[0].items[2].int32_value == 100 + assert result_set.rows[1].items[0].bytes_value == b"Apple" + assert result_set.rows[1].items[1].int32_value == 2 + assert result_set.rows[1].items[2].int32_value == 22 + assert result_set.rows[2].items[0].bytes_value == b"Pear" + assert result_set.rows[2].items[1].int32_value == 15 + assert result_set.rows[2].items[2].int32_value == 33 + assert result_set.rows[3].items[0].bytes_value == b"Banana" + assert result_set.rows[3].items[1].int32_value == 3 + assert result_set.rows[3].items[2].int32_value == 100 + assert result_set.rows[4].items[0].bytes_value == b"Apple" + assert result_set.rows[4].items[1].int32_value == 2 + assert result_set.rows[4].items[2].int32_value == 22 + assert result_set.rows[5].items[0].bytes_value == b"Pear" + assert result_set.rows[5].items[1].int32_value == 15 + assert result_set.rows[5].items[2].int32_value == 33 + assert sum(kikimr.control_plane.get_metering(1)) == 10 diff --git a/ydb/tests/tools/fq_runner/kikimr_utils.py b/ydb/tests/tools/fq_runner/kikimr_utils.py index 9deb83e14262..65c531607956 100644 --- a/ydb/tests/tools/fq_runner/kikimr_utils.py +++ b/ydb/tests/tools/fq_runner/kikimr_utils.py @@ -51,6 +51,18 @@ def apply_to_kikimr(self, request, kikimr): del request.param["inflight"] +class AddAllowConcurrentListingsExtension(ExtensionPoint): + def is_applicable(self, request): + return (hasattr(request, 'param') + and isinstance(request.param, dict) + and "allow_concurrent_listings" in request.param) + + def apply_to_kikimr(self, request, kikimr): + kikimr.allow_concurrent_listings = request.param["allow_concurrent_listings"] + kikimr.compute_plane.fq_config['gateways']['s3']['allow_concurrent_listings'] = kikimr.allow_concurrent_listings + del request.param["allow_concurrent_listings"] + + class AddDataInflightExtension(ExtensionPoint): def is_applicable(self, request): return (hasattr(request, 'param') From afee6e269276b8910a09851b85e92ba455b4f697 Mon Sep 17 00:00:00 2001 From: 
Dmitry Kardymon Date: Wed, 2 Oct 2024 11:00:02 +0300 Subject: [PATCH 36/56] Yq 3322 Shared reading (to stable) (#9915) Co-authored-by: Pisarenko Grigoriy --- .../libs/actors/clusters_from_connections.cpp | 1 + ydb/core/fq/libs/actors/logging/log.h | 7 + .../checkpointing/checkpoint_coordinator.h | 2 +- .../fq/libs/config/protos/fq_config.proto | 2 + .../libs/config/protos/row_dispatcher.proto | 23 + ydb/core/fq/libs/config/protos/ya.make | 1 + ydb/core/fq/libs/events/event_subspace.h | 2 +- ydb/core/fq/libs/events/ya.make | 2 + ydb/core/fq/libs/init/init.cpp | 13 + ydb/core/fq/libs/init/ya.make | 1 + .../fq/libs/row_dispatcher/actors_factory.cpp | 37 + .../fq/libs/row_dispatcher/actors_factory.h | 25 + .../fq/libs/row_dispatcher/coordinator.cpp | 304 +++++++ ydb/core/fq/libs/row_dispatcher/coordinator.h | 19 + .../libs/row_dispatcher/events/data_plane.cpp | 10 + .../libs/row_dispatcher/events/data_plane.h | 125 +++ .../fq/libs/row_dispatcher/events/ya.make | 14 + .../fq/libs/row_dispatcher/json_filter.cpp | 300 +++++++ ydb/core/fq/libs/row_dispatcher/json_filter.h | 34 + .../fq/libs/row_dispatcher/json_parser.cpp | 337 ++++++++ ydb/core/fq/libs/row_dispatcher/json_parser.h | 32 + .../libs/row_dispatcher/leader_election.cpp | 482 +++++++++++ .../fq/libs/row_dispatcher/leader_election.h | 21 + .../libs/row_dispatcher/protos/events.proto | 78 ++ .../fq/libs/row_dispatcher/protos/ya.make | 15 + .../fq/libs/row_dispatcher/row_dispatcher.cpp | 608 ++++++++++++++ .../fq/libs/row_dispatcher/row_dispatcher.h | 27 + .../row_dispatcher/row_dispatcher_service.cpp | 32 + .../row_dispatcher/row_dispatcher_service.h | 26 + .../fq/libs/row_dispatcher/topic_session.cpp | 777 ++++++++++++++++++ .../fq/libs/row_dispatcher/topic_session.h | 24 + .../libs/row_dispatcher/ut/coordinator_ut.cpp | 166 ++++ .../libs/row_dispatcher/ut/json_filter_ut.cpp | 91 ++ .../libs/row_dispatcher/ut/json_parser_ut.cpp | 122 +++ .../row_dispatcher/ut/leader_election_ut.cpp | 140 ++++ 
.../row_dispatcher/ut/row_dispatcher_ut.cpp | 342 ++++++++ .../row_dispatcher/ut/topic_session_ut.cpp | 357 ++++++++ ydb/core/fq/libs/row_dispatcher/ut/ya.make | 30 + ydb/core/fq/libs/row_dispatcher/ya.make | 39 + ydb/core/fq/libs/ya.make | 1 + .../kqp/query_compiler/kqp_query_compiler.cpp | 2 +- ydb/library/services/services.proto | 3 + .../{compute => common}/retry_queue.cpp | 73 +- .../actors/{compute => common}/retry_queue.h | 36 +- .../common/ut/retry_events_queue_ut.cpp | 191 +++++ ydb/library/yql/dq/actors/common/ut/ya.make | 16 + ydb/library/yql/dq/actors/common/ya.make | 19 + .../compute/dq_compute_actor_checkpoints.h | 2 +- ydb/library/yql/dq/actors/compute/ya.make | 4 +- .../yql/dq/integration/yql_dq_integration.h | 2 +- .../yql_clickhouse_dq_integration.cpp | 2 +- .../common/dq/yql_dq_integration_impl.cpp | 2 +- .../common/dq/yql_dq_integration_impl.h | 2 +- .../common/proto/gateways_config.proto | 1 + .../common/pushdown/physical_opt.cpp | 73 ++ .../providers/common/pushdown/physical_opt.h | 11 + .../providers/common/pushdown/type_ann.cpp | 36 + .../yql/providers/common/pushdown/type_ann.h | 17 + .../yql/providers/common/pushdown/ya.make | 2 + .../providers/common/ut_helpers/dq_fake_ca.h | 2 +- .../dq/planner/execution_planner.cpp | 2 +- .../provider/ut/pushdown/pushdown_ut.cpp | 2 +- .../yql/providers/generic/provider/ya.make | 1 + .../yql_generic_datasource_type_ann.cpp | 33 +- .../provider/yql_generic_dq_integration.cpp | 2 +- .../provider/yql_generic_physical_opt.cpp | 61 +- .../yql_generic_predicate_pushdown.cpp | 236 +++++- .../provider/yql_generic_predicate_pushdown.h | 2 +- .../pq/async_io/dq_pq_rd_read_actor.cpp | 697 ++++++++++++++++ .../pq/async_io/dq_pq_rd_read_actor.h | 39 + .../pq/async_io/dq_pq_read_actor.cpp | 114 +-- .../pq/async_io/dq_pq_read_actor_base.cpp | 86 ++ .../pq/async_io/dq_pq_read_actor_base.h | 51 ++ ydb/library/yql/providers/pq/async_io/ya.make | 6 + .../yql/providers/pq/common/yql_names.h | 2 + 
.../pq/expr_nodes/yql_pq_expr_nodes.json | 4 +- .../yql/providers/pq/proto/dq_io.proto | 4 + ydb/library/yql/providers/pq/provider/ya.make | 12 +- .../provider/yql_pq_datasource_type_ann.cpp | 10 +- .../pq/provider/yql_pq_dq_integration.cpp | 88 +- .../pq/provider/yql_pq_logical_opt.cpp | 78 ++ .../providers/pq/provider/yql_pq_settings.cpp | 1 + .../providers/pq/provider/yql_pq_settings.h | 1 + .../s3/actors/yql_s3_raw_read_actor.cpp | 2 +- .../providers/s3/actors/yql_s3_read_actor.cpp | 2 +- .../s3/actors/yql_s3_source_queue.cpp | 2 +- .../s3/provider/yql_s3_dq_integration.cpp | 3 +- .../provider/yql_solomon_dq_integration.cpp | 2 +- .../ydb/provider/yql_ydb_dq_integration.cpp | 2 +- .../public/purecalc/common/no_llvm/ya.make | 1 + .../purecalc/common/no_pg_wrapper/ya.make | 5 + .../yql/public/purecalc/common/ya.make | 1 + .../yql/public/purecalc/common/ya.make.inc | 2 - ydb/public/api/protos/draft/fq.proto | 1 + .../pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp | 359 ++++++++ .../{ => ut}/dq_pq_read_actor_ut.cpp | 2 +- .../{ => ut}/dq_pq_write_actor_ut.cpp | 2 +- ydb/tests/fq/pq_async_io/ut/ya.make | 28 + ydb/tests/fq/pq_async_io/ut_helpers.h | 1 + ydb/tests/fq/pq_async_io/ya.make | 13 +- ydb/tests/fq/yds/test_row_dispatcher.py | 681 +++++++++++++++ ydb/tests/fq/yds/ya.make | 1 + ydb/tests/tools/fq_runner/fq_client.py | 4 +- ydb/tests/tools/fq_runner/kikimr_runner.py | 14 +- 104 files changed, 7578 insertions(+), 247 deletions(-) create mode 100644 ydb/core/fq/libs/config/protos/row_dispatcher.proto create mode 100644 ydb/core/fq/libs/row_dispatcher/actors_factory.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/actors_factory.h create mode 100644 ydb/core/fq/libs/row_dispatcher/coordinator.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/coordinator.h create mode 100644 ydb/core/fq/libs/row_dispatcher/events/data_plane.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/events/data_plane.h create mode 100644 
ydb/core/fq/libs/row_dispatcher/events/ya.make create mode 100644 ydb/core/fq/libs/row_dispatcher/json_filter.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/json_filter.h create mode 100644 ydb/core/fq/libs/row_dispatcher/json_parser.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/json_parser.h create mode 100644 ydb/core/fq/libs/row_dispatcher/leader_election.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/leader_election.h create mode 100644 ydb/core/fq/libs/row_dispatcher/protos/events.proto create mode 100644 ydb/core/fq/libs/row_dispatcher/protos/ya.make create mode 100644 ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/row_dispatcher.h create mode 100644 ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h create mode 100644 ydb/core/fq/libs/row_dispatcher/topic_session.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/topic_session.h create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/coordinator_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp create mode 100644 ydb/core/fq/libs/row_dispatcher/ut/ya.make create mode 100644 ydb/core/fq/libs/row_dispatcher/ya.make rename ydb/library/yql/dq/actors/{compute => common}/retry_queue.cpp (60%) rename ydb/library/yql/dq/actors/{compute => common}/retry_queue.h (87%) create mode 100644 ydb/library/yql/dq/actors/common/ut/retry_events_queue_ut.cpp create mode 100644 ydb/library/yql/dq/actors/common/ut/ya.make create mode 100644 ydb/library/yql/dq/actors/common/ya.make create mode 100644 
ydb/library/yql/providers/common/pushdown/physical_opt.cpp create mode 100644 ydb/library/yql/providers/common/pushdown/physical_opt.h create mode 100644 ydb/library/yql/providers/common/pushdown/type_ann.cpp create mode 100644 ydb/library/yql/providers/common/pushdown/type_ann.h create mode 100644 ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp create mode 100644 ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h create mode 100644 ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.cpp create mode 100644 ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.h create mode 100644 ydb/library/yql/public/purecalc/common/no_pg_wrapper/ya.make create mode 100644 ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp rename ydb/tests/fq/pq_async_io/{ => ut}/dq_pq_read_actor_ut.cpp (99%) rename ydb/tests/fq/pq_async_io/{ => ut}/dq_pq_write_actor_ut.cpp (98%) create mode 100644 ydb/tests/fq/pq_async_io/ut/ya.make create mode 100644 ydb/tests/fq/yds/test_row_dispatcher.py diff --git a/ydb/core/fq/libs/actors/clusters_from_connections.cpp b/ydb/core/fq/libs/actors/clusters_from_connections.cpp index 7b2d8187fd1e..f109fa312b37 100644 --- a/ydb/core/fq/libs/actors/clusters_from_connections.cpp +++ b/ydb/core/fq/libs/actors/clusters_from_connections.cpp @@ -51,6 +51,7 @@ void FillPqClusterConfig(NYql::TPqClusterConfig& clusterConfig, clusterConfig.SetUseSsl(ds.secure()); clusterConfig.SetAddBearerToToken(useBearerForYdb); clusterConfig.SetClusterType(TPqClusterConfig::CT_DATA_STREAMS); + clusterConfig.SetSharedReading(ds.shared_reading()); FillClusterAuth(clusterConfig, ds.auth(), authToken, accountIdSignatures); } diff --git a/ydb/core/fq/libs/actors/logging/log.h b/ydb/core/fq/libs/actors/logging/log.h index d5513bc49b78..a5e79b85706e 100644 --- a/ydb/core/fq/libs/actors/logging/log.h +++ b/ydb/core/fq/libs/actors/logging/log.h @@ -47,6 +47,13 @@ #define LOG_STREAMS_STORAGE_SERVICE_AS_WARN(actorSystem, logRecordStream) 
LOG_STREAMS_IMPL_AS(actorSystem, WARN, STREAMS_STORAGE_SERVICE, logRecordStream) #define LOG_STREAMS_STORAGE_SERVICE_AS_ERROR(actorSystem, logRecordStream) LOG_STREAMS_IMPL_AS(actorSystem, ERROR, STREAMS_STORAGE_SERVICE, logRecordStream) +// Component: ROW_DISPATCHER. +#define LOG_ROW_DISPATCHER_TRACE(logRecordStream) LOG_STREAMS_IMPL(TRACE, FQ_ROW_DISPATCHER, LogPrefix << logRecordStream) +#define LOG_ROW_DISPATCHER_DEBUG(logRecordStream) LOG_STREAMS_IMPL(DEBUG, FQ_ROW_DISPATCHER, LogPrefix << logRecordStream) +#define LOG_ROW_DISPATCHER_INFO(logRecordStream) LOG_STREAMS_IMPL(INFO, FQ_ROW_DISPATCHER, LogPrefix << logRecordStream) +#define LOG_ROW_DISPATCHER_WARN(logRecordStream) LOG_STREAMS_IMPL(WARN, FQ_ROW_DISPATCHER, LogPrefix << logRecordStream) +#define LOG_ROW_DISPATCHER_ERROR(logRecordStream) LOG_STREAMS_IMPL(ERROR, FQ_ROW_DISPATCHER, LogPrefix << logRecordStream) + // Component: STREAMS_SCHEDULER_SERVICE. #define LOG_STREAMS_SCHEDULER_SERVICE_EMERG(logRecordStream) LOG_STREAMS_IMPL(EMERG, STREAMS_SCHEDULER_SERVICE, logRecordStream) #define LOG_STREAMS_SCHEDULER_SERVICE_ALERT(logRecordStream) LOG_STREAMS_IMPL(ALERT, STREAMS_SCHEDULER_SERVICE, logRecordStream) diff --git a/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.h b/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.h index fd683ba62d88..280130f38163 100644 --- a/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.h +++ b/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.h @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include diff --git a/ydb/core/fq/libs/config/protos/fq_config.proto b/ydb/core/fq/libs/config/protos/fq_config.proto index fcca1dafe286..edbf50b40b65 100644 --- a/ydb/core/fq/libs/config/protos/fq_config.proto +++ b/ydb/core/fq/libs/config/protos/fq_config.proto @@ -22,6 +22,7 @@ import "ydb/core/fq/libs/config/protos/quotas_manager.proto"; import "ydb/core/fq/libs/config/protos/rate_limiter.proto"; import 
"ydb/core/fq/libs/config/protos/read_actors_factory.proto"; import "ydb/core/fq/libs/config/protos/resource_manager.proto"; +import "ydb/core/fq/libs/config/protos/row_dispatcher.proto"; import "ydb/core/fq/libs/config/protos/test_connection.proto"; import "ydb/core/fq/libs/config/protos/token_accessor.proto"; import "ydb/library/folder_service/proto/config.proto"; @@ -53,4 +54,5 @@ message TConfig { TRateLimiterConfig RateLimiter = 22; bool EnableTaskCounters = 23; TComputeConfig Compute = 24; + TRowDispatcherConfig RowDispatcher = 25; } diff --git a/ydb/core/fq/libs/config/protos/row_dispatcher.proto b/ydb/core/fq/libs/config/protos/row_dispatcher.proto new file mode 100644 index 000000000000..10ca10285ea0 --- /dev/null +++ b/ydb/core/fq/libs/config/protos/row_dispatcher.proto @@ -0,0 +1,23 @@ +syntax = "proto3"; +option cc_enable_arenas = true; + +package NFq.NConfig; +option java_package = "ru.yandex.kikimr.proto"; + +import "ydb/core/fq/libs/config/protos/storage.proto"; + +//////////////////////////////////////////////////////////// + +message TRowDispatcherCoordinatorConfig { + TYdbStorageConfig Database = 1; + string CoordinationNodePath = 2; +} +message TRowDispatcherConfig { + bool Enabled = 1; + uint64 TimeoutBeforeStartSessionSec = 2; + uint64 SendStatusPeriodSec = 3; + uint64 MaxSessionUsedMemory = 4; + bool WithoutConsumer = 5; + TRowDispatcherCoordinatorConfig Coordinator = 6; + +} diff --git a/ydb/core/fq/libs/config/protos/ya.make b/ydb/core/fq/libs/config/protos/ya.make index 2bf50a9a41fd..92acb431961c 100644 --- a/ydb/core/fq/libs/config/protos/ya.make +++ b/ydb/core/fq/libs/config/protos/ya.make @@ -22,6 +22,7 @@ SRCS( rate_limiter.proto read_actors_factory.proto resource_manager.proto + row_dispatcher.proto storage.proto test_connection.proto token_accessor.proto diff --git a/ydb/core/fq/libs/events/event_subspace.h b/ydb/core/fq/libs/events/event_subspace.h index 5bef4dc1ea48..d88d44d1a8ee 100644 --- a/ydb/core/fq/libs/events/event_subspace.h 
+++ b/ydb/core/fq/libs/events/event_subspace.h @@ -32,7 +32,7 @@ struct TYqEventSubspace { ControlPlaneConfig, YdbCompute, TableOverFq, - + RowDispatcher, SubspacesEnd, }; diff --git a/ydb/core/fq/libs/events/ya.make b/ydb/core/fq/libs/events/ya.make index ad44506d8698..21f47a99f65e 100644 --- a/ydb/core/fq/libs/events/ya.make +++ b/ydb/core/fq/libs/events/ya.make @@ -8,9 +8,11 @@ PEERDIR( ydb/library/actors/core ydb/core/fq/libs/graph_params/proto ydb/core/fq/libs/protos + ydb/core/fq/libs/row_dispatcher/protos ydb/library/yql/core/facade ydb/library/yql/providers/common/db_id_async_resolver ydb/library/yql/providers/dq/provider + ydb/library/yql/providers/pq/proto ydb/library/yql/public/issue ydb/public/api/protos ydb/public/sdk/cpp/client/ydb_table diff --git a/ydb/core/fq/libs/init/init.cpp b/ydb/core/fq/libs/init/init.cpp index 19d745627364..bb071a5a618d 100644 --- a/ydb/core/fq/libs/init/init.cpp +++ b/ydb/core/fq/libs/init/init.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,18 @@ void Init( credentialsFactory = NYql::CreateSecuredServiceAccountCredentialsOverTokenAccessorFactory(tokenAccessorConfig.GetEndpoint(), tokenAccessorConfig.GetUseSsl(), caContent, tokenAccessorConfig.GetConnectionPoolSize()); } + if (protoConfig.GetRowDispatcher().GetEnabled()) { + auto rowDispatcher = NFq::NewRowDispatcherService( + protoConfig.GetRowDispatcher(), + protoConfig.GetCommon(), + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + credentialsFactory, + tenant, + yqCounters->GetSubgroup("subsystem", "row_dispatcher")); + actorRegistrator(NFq::RowDispatcherServiceActorId(), rowDispatcher.release()); + } + auto s3ActorsFactory = NYql::NDq::CreateS3ActorsFactory(); if (protoConfig.GetPrivateApi().GetEnabled()) { diff --git a/ydb/core/fq/libs/init/ya.make b/ydb/core/fq/libs/init/ya.make index edcbe86304aa..857052758169 100644 --- a/ydb/core/fq/libs/init/ya.make +++ b/ydb/core/fq/libs/init/ya.make @@ -24,6 +24,7 
@@ PEERDIR( ydb/core/fq/libs/quota_manager ydb/core/fq/libs/rate_limiter/control_plane_service ydb/core/fq/libs/rate_limiter/quoter_service + ydb/core/fq/libs/row_dispatcher ydb/core/fq/libs/shared_resources ydb/core/fq/libs/test_connection ydb/core/protos diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp new file mode 100644 index 000000000000..b3b4d8517c75 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp @@ -0,0 +1,37 @@ +#include + +#include + +namespace NFq::NRowDispatcher { + + +struct TActorFactory : public IActorFactory { + TActorFactory() {} + + NActors::TActorId RegisterTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters) const override { + + auto actorPtr = NFq::NewTopicSession( + topicPath, + config, + rowDispatcherActorId, + partitionId, + std::move(driver), + credentialsProviderFactory, + counters + ); + return NActors::TlsActivationContext->ExecutorThread.RegisterActor(actorPtr.release(), NActors::TMailboxType::HTSwap, Max()); + } +}; + +IActorFactory::TPtr CreateActorFactory() { + return MakeIntrusive(); +} + +} diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.h b/ydb/core/fq/libs/row_dispatcher/actors_factory.h new file mode 100644 index 000000000000..6cc718b41cde --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include + +namespace NFq::NRowDispatcher { + +struct IActorFactory : public TThrRefBase { + using TPtr = TIntrusivePtr; + + virtual NActors::TActorId RegisterTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + 
std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters) const = 0; +}; + +IActorFactory::TPtr CreateActorFactory(); + +} diff --git a/ydb/core/fq/libs/row_dispatcher/coordinator.cpp b/ydb/core/fq/libs/row_dispatcher/coordinator.cpp new file mode 100644 index 000000000000..dfc483ec939d --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/coordinator.cpp @@ -0,0 +1,304 @@ +#include "coordinator.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace NFq { + +using namespace NActors; +using namespace NThreading; + +using NYql::TIssues; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +struct TCoordinatorMetrics { + explicit TCoordinatorMetrics(const ::NMonitoring::TDynamicCounterPtr& counters) + : Counters(counters) { + IncomingRequests = Counters->GetCounter("IncomingRequests", true); + LeaderChangedCount = Counters->GetCounter("LeaderChangedCount"); + } + + ::NMonitoring::TDynamicCounterPtr Counters; + ::NMonitoring::TDynamicCounters::TCounterPtr IncomingRequests; + ::NMonitoring::TDynamicCounters::TCounterPtr LeaderChangedCount; + +}; + +class TActorCoordinator : public TActorBootstrapped { + + struct TPartitionKey { + TString Endpoint; + TString Database; + TString TopicName; + ui64 PartitionId; + + size_t Hash() const noexcept { + ui64 hash = std::hash()(Endpoint); + hash = CombineHashes(hash, std::hash()(Database)); + hash = CombineHashes(hash, std::hash()(TopicName)); + hash = CombineHashes(hash, std::hash()(PartitionId)); + return hash; + } + bool operator==(const TPartitionKey& other) const { + return Endpoint == other.Endpoint && Database == other.Database + && TopicName == other.TopicName && PartitionId == other.PartitionId; + } + }; + + struct TPartitionKeyHash { + int operator()(const TPartitionKey& k) const { + return k.Hash(); + } + }; + + struct RowDispatcherInfo { + RowDispatcherInfo(bool 
connected, bool isLocal) + : Connected(connected) + , IsLocal(isLocal) {} + bool Connected = false; + bool IsLocal = false; + THashSet Locations; + }; + + NConfig::TRowDispatcherCoordinatorConfig Config; + TYqSharedResources::TPtr YqSharedResources; + TActorId LocalRowDispatcherId; + const TString LogPrefix; + const TString Tenant; + TMap RowDispatchers; + THashMap PartitionLocations; + TCoordinatorMetrics Metrics; + ui64 LocationRandomCounter = 0; + +public: + TActorCoordinator( + NActors::TActorId localRowDispatcherId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters); + + void Bootstrap(); + + static constexpr char ActorName[] = "FQ_RD_COORDINATOR"; + + void Handle(NActors::TEvents::TEvPing::TPtr& ev); + void HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev); + void HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev); + void Handle(NActors::TEvents::TEvUndelivered::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorRequest::TPtr& ev); + + STRICT_STFUNC( + StateFunc, { + hFunc(NActors::TEvents::TEvPing, Handle); + hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected); + hFunc(NActors::TEvents::TEvUndelivered, Handle); + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChanged, Handle); + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorRequest, Handle); + }) + +private: + + void AddRowDispatcher(NActors::TActorId actorId, bool isLocal); + void PrintInternalState(); + NActors::TActorId GetAndUpdateLocation(const TPartitionKey& key); +}; + +TActorCoordinator::TActorCoordinator( + NActors::TActorId localRowDispatcherId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const 
::NMonitoring::TDynamicCounterPtr& counters) + : Config(config) + , YqSharedResources(yqSharedResources) + , LocalRowDispatcherId(localRowDispatcherId) + , LogPrefix("Coordinator: ") + , Tenant(tenant) + , Metrics(counters) { + AddRowDispatcher(localRowDispatcherId, true); +} + +void TActorCoordinator::Bootstrap() { + Become(&TActorCoordinator::StateFunc); + Send(LocalRowDispatcherId, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe()); + LOG_ROW_DISPATCHER_DEBUG("Successfully bootstrapped coordinator, id " << SelfId()); +} + +void TActorCoordinator::AddRowDispatcher(NActors::TActorId actorId, bool isLocal) { + auto it = RowDispatchers.find(actorId); + if (it != RowDispatchers.end()) { + it->second.Connected = true; + return; + } + + for (auto& [oldActorId, info] : RowDispatchers) { + if (oldActorId.NodeId() != actorId.NodeId()) { + continue; + } + + LOG_ROW_DISPATCHER_TRACE("Move all Locations from old actor " << oldActorId.ToString() << " to new " << actorId.ToString()); + for (auto& key : info.Locations) { + PartitionLocations[key] = actorId; + } + info.Connected = true; + auto node = RowDispatchers.extract(oldActorId); + node.key() = actorId; + RowDispatchers.insert(std::move(node)); + return; + } + + RowDispatchers.emplace(actorId, RowDispatcherInfo{true, isLocal}); +} + +void TActorCoordinator::Handle(NActors::TEvents::TEvPing::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvPing received, " << ev->Sender); + AddRowDispatcher(ev->Sender, false); + PrintInternalState(); + LOG_ROW_DISPATCHER_TRACE("Send TEvPong to " << ev->Sender); + Send(ev->Sender, new NActors::TEvents::TEvPong(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); +} + +void TActorCoordinator::PrintInternalState() { + TStringStream str; + str << "Known row dispatchers:\n"; + + for (const auto& [actorId, info] : RowDispatchers) { + str << " " << actorId << ", connected " << info.Connected << "\n"; + } + + str << "\nLocations:\n"; + for (auto& [key, actorId] : 
PartitionLocations) { + str << " " << key.Endpoint << " / " << key.Database << " / " << key.TopicName << ", partId " << key.PartitionId << ", row dispatcher actor id: " << actorId << "\n"; + } + LOG_ROW_DISPATCHER_DEBUG(str.Str()); +} + +void TActorCoordinator::HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("EvNodeConnected " << ev->Get()->NodeId); + // Dont set Connected = true. + // Wait TEvPing from row dispatchers. +} + +void TActorCoordinator::HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvNodeDisconnected, node id " << ev->Get()->NodeId); + + for (auto& [actorId, info] : RowDispatchers) { + if (ev->Get()->NodeId != actorId.NodeId()) { + continue; + } + Y_ENSURE(!info.IsLocal, "EvNodeDisconnected from local row dispatcher"); + info.Connected = false; + } +} + +void TActorCoordinator::Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvUndelivered, ev: " << ev->Get()->ToString()); + + for (auto& [actorId, info] : RowDispatchers) { + if (ev->Sender != actorId) { + continue; + } + info.Connected = false; + return; + } +} + +void TActorCoordinator::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("New leader " << ev->Get()->CoordinatorActorId << ", SelfId " << SelfId()); + Metrics.LeaderChangedCount->Inc(); +} + +NActors::TActorId TActorCoordinator::GetAndUpdateLocation(const TPartitionKey& key) { + Y_ENSURE(!PartitionLocations.contains(key)); + auto rand = LocationRandomCounter++ % RowDispatchers.size(); + + auto it = std::begin(RowDispatchers); + std::advance(it, rand); + + for (size_t i = 0; i < RowDispatchers.size(); ++i) { + auto& info = it->second; + if (!info.Connected) { + it++; + if (it == std::end(RowDispatchers)) { + it = std::begin(RowDispatchers); + } + continue; + } + PartitionLocations[key] = it->first; + it->second.Locations.insert(key); + return it->first; + } + Y_ENSURE(false, 
"Local row dispatcher should always be connected"); +} + +void TActorCoordinator::Handle(NFq::TEvRowDispatcher::TEvCoordinatorRequest::TPtr& ev) { + const auto source = ev->Get()->Record.GetSource(); + + TStringStream str; + str << "TEvCoordinatorRequest from " << ev->Sender.ToString() << ", " << source.GetTopicPath() << ", partIds: "; + for (auto& partitionId : ev->Get()->Record.GetPartitionId()) { + str << partitionId << ", "; + } + LOG_ROW_DISPATCHER_DEBUG(str.Str()); + Metrics.IncomingRequests->Inc(); + Y_ENSURE(!RowDispatchers.empty()); + + TMap> tmpResult; + + for (auto& partitionId : ev->Get()->Record.GetPartitionId()) { + TPartitionKey key{source.GetEndpoint(), source.GetDatabase(), source.GetTopicPath(), partitionId}; + auto locationIt = PartitionLocations.find(key); + NActors::TActorId rowDispatcherId; + if (locationIt != PartitionLocations.end()) { + rowDispatcherId = locationIt->second; + } else { + rowDispatcherId = GetAndUpdateLocation(key); + } + tmpResult[rowDispatcherId].insert(partitionId); + } + + auto response = std::make_unique(); + for (const auto& [actorId, partitions] : tmpResult) { + auto* partitionsProto = response->Record.AddPartitions(); + ActorIdToProto(actorId, partitionsProto->MutableActorId()); + for (auto partitionId : partitions) { + partitionsProto->AddPartitionId(partitionId); + } + } + + LOG_ROW_DISPATCHER_DEBUG("Send TEvCoordinatorResult to " << ev->Sender); + Send(ev->Sender, response.release(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, ev->Cookie); + PrintInternalState(); +} + + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewCoordinator( + NActors::TActorId rowDispatcherId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters) +{ + return std::unique_ptr(new 
TActorCoordinator(rowDispatcherId, config, yqSharedResources, tenant, counters)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/coordinator.h b/ydb/core/fq/libs/row_dispatcher/coordinator.h new file mode 100644 index 000000000000..60bd2f4f0ad0 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/coordinator.h @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include +#include + +namespace NFq { + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewCoordinator( + NActors::TActorId rowDispatcherId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/events/data_plane.cpp b/ydb/core/fq/libs/row_dispatcher/events/data_plane.cpp new file mode 100644 index 000000000000..dce6540d44a9 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/events/data_plane.cpp @@ -0,0 +1,10 @@ +#include "data_plane.h" + +namespace NFq { + +NActors::TActorId RowDispatcherServiceActorId() { + constexpr TStringBuf name = "ROW_DISP_DP"; + return NActors::TActorId(0, name); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/events/data_plane.h b/ydb/core/fq/libs/row_dispatcher/events/data_plane.h new file mode 100644 index 000000000000..5cecb5251674 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/events/data_plane.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace NFq { + +NActors::TActorId RowDispatcherServiceActorId(); + +struct TEvRowDispatcher { + // Event ids. 
+ enum EEv : ui32 { + EvCoordinatorChanged = YqEventSubspaceBegin(TYqEventSubspace::RowDispatcher), + EvStartSession, + EvStartSessionAck, + EvNewDataArrived, + EvGetNextBatch, + EvMessageBatch, + EvStatus, + EvStopSession, + EvSessionError, + EvCoordinatorChangesSubscribe, + EvCoordinatorRequest, + EvCoordinatorResult, + EvEnd, + }; + + struct TEvCoordinatorChanged : NActors::TEventLocal { + TEvCoordinatorChanged(NActors::TActorId coordinatorActorId) + : CoordinatorActorId(coordinatorActorId) { + } + NActors::TActorId CoordinatorActorId; + }; + + struct TEvCoordinatorChangesSubscribe : public NActors::TEventLocal {}; + + struct TEvCoordinatorRequest : public NActors::TEventPB { + TEvCoordinatorRequest() = default; + TEvCoordinatorRequest( + const NYql::NPq::NProto::TDqPqTopicSource& sourceParams, + const std::vector& partitionIds) { + *Record.MutableSource() = sourceParams; + for (const auto& id : partitionIds) { + Record.AddPartitionId(id); + } + } + }; + + struct TEvCoordinatorResult : public NActors::TEventPB { + TEvCoordinatorResult() = default; + }; + + struct TEvStartSession : public NActors::TEventPB { + + TEvStartSession() = default; + TEvStartSession( + const NYql::NPq::NProto::TDqPqTopicSource& sourceParams, + ui64 partitionId, + const TString token, + TMaybe readOffset, + ui64 startingMessageTimestampMs, + const TString& queryId) { + *Record.MutableSource() = sourceParams; + Record.SetPartitionId(partitionId); + Record.SetToken(token); + if (readOffset) { + Record.SetOffset(*readOffset); + } + Record.SetStartingMessageTimestampMs(startingMessageTimestampMs); + Record.SetQueryId(queryId); + } + }; + + struct TEvStartSessionAck : public NActors::TEventPB { + TEvStartSessionAck() = default; + explicit TEvStartSessionAck( + const NFq::NRowDispatcherProto::TEvStartSession& consumer) { + *Record.MutableConsumer() = consumer; + } + }; + + struct TEvNewDataArrived : public NActors::TEventPB { + TEvNewDataArrived() = default; + NActors::TActorId ReadActorId; + 
}; + + struct TEvGetNextBatch : public NActors::TEventPB { + TEvGetNextBatch() = default; + }; + + struct TEvStopSession : public NActors::TEventPB { + TEvStopSession() = default; + }; + + struct TEvMessageBatch : public NActors::TEventPB { + TEvMessageBatch() = default; + NActors::TActorId ReadActorId; + }; + + struct TEvStatus : public NActors::TEventPB { + TEvStatus() = default; + NActors::TActorId ReadActorId; + }; + + struct TEvSessionError : public NActors::TEventPB { + TEvSessionError() = default; + NActors::TActorId ReadActorId; + }; +}; + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/events/ya.make b/ydb/core/fq/libs/row_dispatcher/events/ya.make new file mode 100644 index 000000000000..60f0b00e7e90 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/events/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +SRCS( + data_plane.cpp +) + +PEERDIR( + ydb/core/fq/libs/events + ydb/core/fq/libs/row_dispatcher/protos + ydb/library/actors/core + ydb/library/yql/providers/pq/provider +) + +END() diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.cpp b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp new file mode 100644 index 000000000000..8b7d46a690f2 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace { + +using TCallback = NFq::TJsonFilter::TCallback; +const char* OffsetFieldName = "_offset"; +TString LogPrefix = "JsonFilter: "; + +void AddField(NYT::TNode& node, const TString& fieldName, const TString& fieldType) { + node.Add( + NYT::TNode::CreateList() + .Add(fieldName) + .Add(NYT::TNode::CreateList().Add("DataType").Add(fieldType)) + ); +} + +NYT::TNode MakeInputSchema(const TVector& columns) { + auto structMembers = NYT::TNode::CreateList(); + AddField(structMembers, OffsetFieldName, "Uint64"); + for (const auto& col : columns) { + AddField(structMembers, col, "String"); + } + return 
NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); +} + +NYT::TNode MakeOutputSchema() { + auto structMembers = NYT::TNode::CreateList(); + AddField(structMembers, OffsetFieldName, "Uint64"); + AddField(structMembers, "data", "String"); + return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); +} + +class TFilterInputSpec : public NYql::NPureCalc::TInputSpecBase { +public: + TFilterInputSpec(const NYT::TNode& schema) + : Schemas({schema}) { + } + + const TVector& GetSchemas() const override { + return Schemas; + } + +private: + TVector Schemas; +}; + +class TFilterInputConsumer : public NYql::NPureCalc::IConsumer>> { +public: + TFilterInputConsumer( + const TFilterInputSpec& spec, + NYql::NPureCalc::TWorkerHolder worker) + : Worker(std::move(worker)) { + const NKikimr::NMiniKQL::TStructType* structType = Worker->GetInputType(); + const auto count = structType->GetMembersCount(); + + THashMap schemaPositions; + for (ui32 i = 0; i < count; ++i) { + const auto name = structType->GetMemberName(i); + if (name == OffsetFieldName) { + OffsetPosition = i; + continue; + } + schemaPositions[name] = i; + } + + const NYT::TNode& schema = spec.GetSchemas()[0]; + const auto& fields = schema[1]; + Y_ENSURE(count == fields.Size()); + Y_ENSURE(fields.IsList()); + for (size_t i = 0; i < fields.Size(); ++i) { + auto name = fields[i][0].AsString(); + if (name == OffsetFieldName) { + continue; + } + FieldsPositions.push_back(schemaPositions[name]); + } + } + + ~TFilterInputConsumer() override { + with_lock(Worker->GetScopedAlloc()) { + Cache.Clear(); + } + } + + void OnObject(std::pair> value) override { + NKikimr::NMiniKQL::TThrowingBindTerminator bind; + + with_lock (Worker->GetScopedAlloc()) { + auto& holderFactory = Worker->GetGraph().GetHolderFactory(); + NYql::NUdf::TUnboxedValue* items = nullptr; + + NYql::NUdf::TUnboxedValue result = Cache.NewArray( + holderFactory, + static_cast(value.second.size() + 1), + items); + + 
items[OffsetPosition] = NYql::NUdf::TUnboxedValuePod(value.first); + + Y_ENSURE(FieldsPositions.size() == value.second.size()); + + size_t i = 0; + for (const auto& v : value.second) { + NYql::NUdf::TStringValue str(v); + items[FieldsPositions[i++]] = NYql::NUdf::TUnboxedValuePod(std::move(str)); + } + Worker->Push(std::move(result)); + } + } + + void OnFinish() override { + NKikimr::NMiniKQL::TBindTerminator bind(Worker->GetGraph().GetTerminator()); + with_lock(Worker->GetScopedAlloc()) { + Worker->OnFinish(); + } + } + +private: + NYql::NPureCalc::TWorkerHolder Worker; + NKikimr::NMiniKQL::TPlainContainerCache Cache; + size_t OffsetPosition = 0; + TVector FieldsPositions; +}; + +class TFilterOutputConsumer: public NYql::NPureCalc::IConsumer> { +public: + TFilterOutputConsumer(TCallback callback) + : Callback(callback) { + } + + void OnObject(std::pair value) override { + Callback(value.first, value.second); + } + + void OnFinish() override { + Y_UNREACHABLE(); + } +private: + TCallback Callback; +}; + +class TFilterOutputSpec: public NYql::NPureCalc::TOutputSpecBase { +public: + explicit TFilterOutputSpec(const NYT::TNode& schema) + : Schema(schema) + {} + +public: + const NYT::TNode& GetSchema() const override { + return Schema; + } + +private: + NYT::TNode Schema; +}; + +class TFilterPushRelayImpl: public NYql::NPureCalc::IConsumer { +public: + TFilterPushRelayImpl(const TFilterOutputSpec& /*outputSpec*/, NYql::NPureCalc::IPushStreamWorker* worker, THolder>> underlying) + : Underlying(std::move(underlying)) + , Worker(worker) + {} +public: + void OnObject(const NYql::NUdf::TUnboxedValue* value) override { + auto unguard = Unguard(Worker->GetScopedAlloc()); + Y_ENSURE(value->GetListLength() == 2); + ui64 offset = value->GetElement(0).Get(); + const auto& cell = value->GetElement(1); + Y_ENSURE(cell); + TString str(cell.AsStringRef()); + Underlying->OnObject(std::make_pair(offset, str)); + } + + void OnFinish() override { + auto unguard = 
Unguard(Worker->GetScopedAlloc()); + Underlying->OnFinish(); + } + +private: + THolder>> Underlying; + NYql::NPureCalc::IWorker* Worker; +}; + +} + +template <> +struct NYql::NPureCalc::TInputSpecTraits { + static constexpr bool IsPartial = false; + static constexpr bool SupportPushStreamMode = true; + + using TConsumerType = THolder>>>; + + static TConsumerType MakeConsumer( + const TFilterInputSpec& spec, + NYql::NPureCalc::TWorkerHolder worker) + { + return MakeHolder(spec, std::move(worker)); + } +}; + +template <> +struct NYql::NPureCalc::TOutputSpecTraits { + static const constexpr bool IsPartial = false; + static const constexpr bool SupportPushStreamMode = true; + + static void SetConsumerToWorker(const TFilterOutputSpec& outputSpec, NYql::NPureCalc::IPushStreamWorker* worker, THolder>> consumer) { + worker->SetConsumer(MakeHolder(outputSpec, worker, std::move(consumer))); + } +}; + +namespace NFq { + +class TJsonFilter::TImpl { +public: + TImpl(const TVector& columns, + const TVector& types, + const TString& whereFilter, + TCallback callback) + : Sql(GenerateSql(columns, types, whereFilter)) { + auto factory = NYql::NPureCalc::MakeProgramFactory(NYql::NPureCalc::TProgramFactoryOptions()); + + LOG_ROW_DISPATCHER_DEBUG("Creating program..."); + Program = factory->MakePushStreamProgram( + TFilterInputSpec(MakeInputSchema(columns)), + TFilterOutputSpec(MakeOutputSchema()), + Sql, + NYql::NPureCalc::ETranslationMode::SQL + ); + InputConsumer = Program->Apply(MakeHolder(callback)); + LOG_ROW_DISPATCHER_DEBUG("Program created"); + } + + void Push(ui64 offset, const TList& value) { + InputConsumer->OnObject(std::make_pair(offset, value)); + } + + TString GetSql() const { + return Sql; + } + +private: + TString GenerateSql(const TVector& columnNames, const TVector& columnTypes, const TString& whereFilter) { + TStringStream str; + str << "$fields = SELECT "; + Y_ABORT_UNLESS(columnNames.size() == columnTypes.size()); + str << OffsetFieldName << ", "; + for (size_t i 
= 0; i < columnNames.size(); ++i) { + str << "CAST(" << columnNames[i] << " as " << columnTypes[i] << ") as " << columnNames[i] << ((i != columnNames.size() - 1) ? "," : ""); + } + str << " FROM Input;\n"; + str << "$filtered = SELECT * FROM $fields " << whereFilter << ";\n"; + + str << "SELECT " << OffsetFieldName << ", Unwrap(Json::SerializeJson(Yson::From(RemoveMembers(TableRow(), [\"" << OffsetFieldName; + str << "\"])))) as data FROM $filtered"; + LOG_ROW_DISPATCHER_DEBUG("Generated sql: " << str.Str()); + return str.Str(); + } + +private: + THolder> Program; + THolder>>> InputConsumer; + const TString Sql; +}; + +TJsonFilter::TJsonFilter( + const TVector& columns, + const TVector& types, + const TString& whereFilter, + TCallback callback) + : Impl(std::make_unique(columns, types, whereFilter, callback)) { +} + +TJsonFilter::~TJsonFilter() { +} + +void TJsonFilter::Push(ui64 offset, const TList& value) { + Impl->Push(offset, value); +} + +TString TJsonFilter::GetSql() { + return Impl->GetSql(); +} + +std::unique_ptr NewJsonFilter( + const TVector& columns, + const TVector& types, + const TString& whereFilter, + TCallback callback) { + return std::unique_ptr(new TJsonFilter(columns, types, whereFilter, callback)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.h b/ydb/core/fq/libs/row_dispatcher/json_filter.h new file mode 100644 index 000000000000..f1694a277fbb --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.h @@ -0,0 +1,34 @@ + +#pragma once + +namespace NFq { + +#include +#include + +class TJsonFilter { +public: + using TCallback = std::function; + +public: + TJsonFilter( + const TVector& columns, + const TVector& types, + const TString& whereFilter, + TCallback callback); + ~TJsonFilter(); + void Push(ui64 offset, const TList& value); + TString GetSql(); + +private: + class TImpl; + const std::unique_ptr Impl; +}; + +std::unique_ptr NewJsonFilter( + const TVector& columns, + const TVector& types, + const 
TString& whereFilter, + TJsonFilter::TCallback callback); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.cpp b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp new file mode 100644 index 000000000000..84ca3018b509 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp @@ -0,0 +1,337 @@ +#include + +#include +#include +#include +#include +#include + + +namespace { + +using TCallback = NFq::TJsonParser::TCallback; +using TInputConsumerArg = std::pair; +const char* OffsetFieldName = "_offset"; +TString LogPrefix = "JsonParser: "; + +void AddField(NYT::TNode& node, const TString& fieldName, const TString& fieldType) { + node.Add( + NYT::TNode::CreateList() + .Add(fieldName) + .Add(NYT::TNode::CreateList().Add("DataType").Add(fieldType)) + ); +} + +NYT::TNode MakeInputSchema() { + auto structMembers = NYT::TNode::CreateList(); + AddField(structMembers, OffsetFieldName, "Uint64"); + AddField(structMembers, "data", "String"); + return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); +} + +NYT::TNode MakeOutputSchema(const TVector& columns) { + auto structMembers = NYT::TNode::CreateList(); + AddField(structMembers, OffsetFieldName, "Uint64"); + for (const auto& col : columns) { + AddField(structMembers, col, "String"); + } + return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); +} + +class TParserInputConsumer : public NYql::NPureCalc::IConsumer { +public: + explicit TParserInputConsumer(NYql::NPureCalc::TWorkerHolder worker) + : Worker(std::move(worker)) { + } + + ~TParserInputConsumer() override { + with_lock(Worker->GetScopedAlloc()) { + Cache.Clear(); + } + } + + void OnObject(std::pair value) override { + NKikimr::NMiniKQL::TThrowingBindTerminator bind; + + with_lock (Worker->GetScopedAlloc()) { + auto& holderFactory = Worker->GetGraph().GetHolderFactory(); + NYql::NUdf::TUnboxedValue* items = nullptr; + + NYql::NUdf::TUnboxedValue result = Cache.NewArray( + holderFactory, 
+ static_cast(2), + items); + + items[0] = NYql::NUdf::TUnboxedValuePod(value.first); + NYql::NUdf::TStringValue str(value.second.Size()); + std::memcpy(str.Data(), value.second.Data(), value.second.Size()); + items[1] = NYql::NUdf::TUnboxedValuePod(std::move(str)); + Worker->Push(std::move(result)); + } + } + + void OnFinish() override { + NKikimr::NMiniKQL::TBindTerminator bind(Worker->GetGraph().GetTerminator()); + with_lock(Worker->GetScopedAlloc()) { + Worker->OnFinish(); + } + } + +private: + NYql::NPureCalc::TWorkerHolder Worker; + NKikimr::NMiniKQL::TPlainContainerCache Cache; +}; + + +class TParserInputSpec : public NYql::NPureCalc::TInputSpecBase { +public: + TParserInputSpec() { + Schemas = {MakeInputSchema()}; + } + + const TVector& GetSchemas() const override { + return Schemas; + } + +private: + TVector Schemas; +}; + + +class TParserOutputConsumer: public NYql::NPureCalc::IConsumer>> { +public: + TParserOutputConsumer(TCallback callback) + : Callback(callback) { + } + + void OnObject(std::pair> value) override { + Callback(value.first, std::move(value.second)); + } + + void OnFinish() override { + Y_UNREACHABLE(); + } +private: + TCallback Callback; +}; + +class TParserOutputSpec: public NYql::NPureCalc::TOutputSpecBase { +public: + explicit TParserOutputSpec(const NYT::TNode& schema) + : Schema(schema) + {} + +public: + const NYT::TNode& GetSchema() const override { + return Schema; + } + +private: + NYT::TNode Schema; +}; + +struct TFieldsMapping{ + TVector FieldsPositions; + size_t OffsetPosition; + + TFieldsMapping(const NYT::TNode& schema, const NKikimr::NMiniKQL::TType* outputType) { + THashMap outputPositions; + Y_ENSURE(outputType->IsStruct()); + const auto structType = static_cast(outputType); + const auto count = structType->GetMembersCount(); + + for (ui32 i = 1; i < count; ++i) { // 0 index - OffsetFieldName + const auto name = structType->GetMemberName(i); + outputPositions[name] = i; + } + + const auto& fields = schema[1]; + 
Y_ENSURE(fields.IsList()); + Y_ENSURE(count == fields.Size()); + for (size_t i = 0; i < fields.Size(); ++i) { + auto name = fields[i][0].AsString(); + if (name == OffsetFieldName) { + OffsetPosition = i; + continue; + } + FieldsPositions.push_back(outputPositions[name]); + } + } +}; + +class TParserPushRelayImpl: public NYql::NPureCalc::IConsumer { +public: + TParserPushRelayImpl(const TParserOutputSpec& outputSpec, NYql::NPureCalc::IPushStreamWorker* worker, THolder>>> underlying) + : Underlying(std::move(underlying)) + , Worker(worker) + , FieldsMapping(outputSpec.GetSchema(), Worker->GetOutputType()) + { } + +public: + void OnObject(const NYql::NUdf::TUnboxedValue* value) override { + auto unguard = Unguard(Worker->GetScopedAlloc()); + TList result; + + Y_ENSURE(value->GetListLength() == FieldsMapping.FieldsPositions.size() + 1); + ui64 offset = value->GetElement(FieldsMapping.OffsetPosition).Get(); + + for (auto pos : FieldsMapping.FieldsPositions) { + const auto& cell = value->GetElement(pos); + + NYql::NUdf::TStringRef strRef(cell.AsStringRef()); + result.emplace_back(strRef.Data(), strRef.Size()); + } + + Underlying->OnObject(std::make_pair(offset, std::move(result))); + } + + void OnFinish() override { + auto unguard = Unguard(Worker->GetScopedAlloc()); + Underlying->OnFinish(); + } + +private: + THolder>>> Underlying; + NYql::NPureCalc::IWorker* Worker; + TFieldsMapping FieldsMapping; +}; + +} + +template <> +struct NYql::NPureCalc::TInputSpecTraits { + static constexpr bool IsPartial = false; + static constexpr bool SupportPushStreamMode = true; + + using TConsumerType = THolder>; + + static TConsumerType MakeConsumer( + const TParserInputSpec& spec, + NYql::NPureCalc::TWorkerHolder worker + ) { + Y_UNUSED(spec); + return MakeHolder(std::move(worker)); + } +}; + +template <> +struct NYql::NPureCalc::TOutputSpecTraits { + static const constexpr bool IsPartial = false; + static const constexpr bool SupportPushStreamMode = true; + + static void 
SetConsumerToWorker(const TParserOutputSpec& outputSpec, NYql::NPureCalc::IPushStreamWorker* worker, THolder>>> consumer) { + worker->SetConsumer(MakeHolder(outputSpec, worker, std::move(consumer))); + } +}; + +namespace NFq { + +class TJsonParser::TImpl { +public: + TImpl( + const TVector& columns, + const TVector& types, + TCallback callback) + : Sql(GenerateSql(columns, types)) { + auto options = NYql::NPureCalc::TProgramFactoryOptions(); + auto factory = NYql::NPureCalc::MakeProgramFactory(options); + + LOG_ROW_DISPATCHER_DEBUG("Creating program..."); + Program = factory->MakePushStreamProgram( + TParserInputSpec(), + TParserOutputSpec(MakeOutputSchema(columns)), + Sql, + NYql::NPureCalc::ETranslationMode::SExpr + ); + LOG_ROW_DISPATCHER_DEBUG("Program created"); + InputConsumer = Program->Apply(MakeHolder(callback)); + LOG_ROW_DISPATCHER_DEBUG("InputConsumer created"); + } + + void Push( ui64 offset, const TString& value) { + LOG_ROW_DISPATCHER_TRACE("Push " << value); + InputConsumer->OnObject(std::make_pair(offset, value)); + } + + TString GetSql() const { + return Sql; + } + +private: + TString GenerateSql(const TVector& columnNames, const TVector& columnTypes) { + Y_ABORT_UNLESS(columnNames.size() == columnTypes.size(), "Unexpected column types size"); + + TStringStream udfOutputType; + TStringStream resultType; + for (size_t i = 0; i < columnNames.size(); ++i) { + const TString& lastSymbol = i + 1 == columnNames.size() ? 
"" : " "; + const TString& column = columnNames[i]; + const TString& type = SkipOptional(columnTypes[i]); + + udfOutputType << "'('" << column << " (DataType '" << type << "))" << lastSymbol; + resultType << "'('" << column << " (SafeCast (Member $parsed '" << column << ") $string_type))" << lastSymbol; + } + + TStringStream str; + str << R"( + ( + (let $string_type (DataType 'String)) + + (let $input_type (TupleType $string_type (DataType 'Uint64))) + (let $output_type (TupleType (StructType )" << udfOutputType.Str() << R"() (DataType 'Uint64))) + (let $udf_argument_type (TupleType $input_type (StructType) $output_type)) + (let $udf_callable_type (CallableType '('1) '((StreamType $output_type)) '((StreamType $input_type)) '((OptionalType (DataType 'Utf8))))) + (let $udf (Udf 'ClickHouseClient.ParseFormat (Void) $udf_argument_type 'json_each_row $udf_callable_type (VoidType) '"" '())) + + (return (Map (Apply $udf (Map (Self '0) (lambda '($input) (block '( + (return '((Member $input 'data) (Member $input ')" << OffsetFieldName << R"())) + ))))) (lambda '($output) (block '( + (let $parsed (Nth $output '0)) + (return (AsStruct '(')" << OffsetFieldName << R"( (Nth $output '1)) )" << resultType.Str() << R"()) + ))))) + ) + )"; + LOG_ROW_DISPATCHER_DEBUG("GenerateSql " << str.Str()); + return str.Str(); + } + + static TString SkipOptional(TStringBuf type) { + if (type.StartsWith("Optional")) { + Y_ABORT_UNLESS(type.SkipPrefix("Optional<")); + Y_ABORT_UNLESS(type.ChopSuffix(">")); + } + return TString(type); + } + +private: + THolder> Program; + THolder> InputConsumer; + const TString Sql; +}; + +TJsonParser::TJsonParser( + const TVector& columns, + const TVector& types, + TCallback callback) + : Impl(std::make_unique(columns, types, callback)) { +} + +TJsonParser::~TJsonParser() { +} + +void TJsonParser::Push(ui64 offset, const TString& value) { + Impl->Push(offset, value); +} + +TString TJsonParser::GetSql() { + return Impl->GetSql(); +} + +std::unique_ptr 
NewJsonParser( + const TVector& columns, + const TVector& types, + TCallback callback) { + return std::unique_ptr(new TJsonParser(columns, types, callback)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.h b/ydb/core/fq/libs/row_dispatcher/json_parser.h new file mode 100644 index 000000000000..cb5137105e6b --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/json_parser.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +#include + +namespace NFq { + +class TJsonParser { +public: + using TCallback = std::function&&)>; + +public: + TJsonParser( + const TVector& columns, + const TVector& types, + TCallback callback); + ~TJsonParser(); + void Push(ui64 offset, const TString& value); + TString GetSql(); + +private: + class TImpl; + const std::unique_ptr Impl; +}; + +std::unique_ptr NewJsonParser( + const TVector& columns, + const TVector& types, + TJsonParser::TCallback callback); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/leader_election.cpp b/ydb/core/fq/libs/row_dispatcher/leader_election.cpp new file mode 100644 index 000000000000..6817cfc292c0 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/leader_election.cpp @@ -0,0 +1,482 @@ +#include "coordinator.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace NFq { + +using namespace NActors; +using namespace NThreading; +using NYql::TIssues; + +namespace { + +const ui64 TimeoutDurationSec = 3; +const TString SemaphoreName = "RowDispatcher"; + +struct TEvPrivate { + // Event ids + enum EEv : ui32 { + EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), + EvCreateSemaphoreResult = EvBegin, + EvCreateSessionResult, + EvAcquireSemaphoreResult, + EvDescribeSemaphoreResult, + EvSessionStopped, + EvTimeout, + EvOnChangedResult, + EvEnd + }; + + static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); + + // Events + struct 
TEvCreateSemaphoreResult : NActors::TEventLocal { + NYdb::NCoordination::TAsyncResult Result; + explicit TEvCreateSemaphoreResult(const NYdb::NCoordination::TAsyncResult& future) + : Result(std::move(future)) {} + }; + struct TEvCreateSessionResult : NActors::TEventLocal { + NYdb::NCoordination::TAsyncSessionResult Result; + explicit TEvCreateSessionResult(NYdb::NCoordination::TAsyncSessionResult future) + : Result(std::move(future)) {} + }; + + struct TEvOnChangedResult : NActors::TEventLocal { + bool Result; + explicit TEvOnChangedResult(bool result) + : Result(result) {} + }; + + struct TEvDescribeSemaphoreResult : NActors::TEventLocal { + NYdb::NCoordination::TAsyncDescribeSemaphoreResult Result; + explicit TEvDescribeSemaphoreResult(NYdb::NCoordination::TAsyncDescribeSemaphoreResult future) + : Result(std::move(future)) {} + }; + + struct TEvAcquireSemaphoreResult : NActors::TEventLocal { + NYdb::NCoordination::TAsyncResult Result; + explicit TEvAcquireSemaphoreResult(NYdb::NCoordination::TAsyncResult future) + : Result(std::move(future)) {} + }; + struct TEvSessionStopped : NActors::TEventLocal {}; + struct TEvTimeout : NActors::TEventLocal {}; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TLeaderElectionMetrics { + explicit TLeaderElectionMetrics(const ::NMonitoring::TDynamicCounterPtr& counters) + : Counters(counters) { + Errors = Counters->GetCounter("LeaderElectionErrors", true); + LeaderChangedCount = Counters->GetCounter("LeaderElectionChangedCount"); + } + + ::NMonitoring::TDynamicCounterPtr Counters; + ::NMonitoring::TDynamicCounters::TCounterPtr Errors; + ::NMonitoring::TDynamicCounters::TCounterPtr LeaderChangedCount; +}; + +class TLeaderElection: public TActorBootstrapped { + + enum class EState { + Init, + WaitNodeCreated, + WaitSessionCreated, + WaitSemaphoreCreated, + Started + }; + NFq::NConfig::TRowDispatcherCoordinatorConfig Config; + const NKikimr::TYdbCredentialsProviderFactory& 
CredentialsProviderFactory; + TYqSharedResources::TPtr YqSharedResources; + TYdbConnectionPtr YdbConnection; + TString TablePathPrefix; + TString CoordinationNodePath; + TMaybe Session; + TActorId ParentId; + TActorId CoordinatorId; + TString LogPrefix; + const TString Tenant; + EState State = EState::Init; + bool CoordinationNodeCreated = false; + bool SemaphoreCreated = false; + bool TimeoutScheduled = false; + bool PendingDescribe = false; + bool PendingAcquire = false; + + TMaybe LeaderActorId; + + struct NodeInfo { + bool Connected = false; + }; + std::map RowDispatchersByNode; + TLeaderElectionMetrics Metrics; + +public: + TLeaderElection( + NActors::TActorId parentId, + NActors::TActorId coordinatorId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters); + + void Bootstrap(); + void PassAway() override; + + static constexpr char ActorName[] = "YQ_LEADER_EL"; + + void Handle(NFq::TEvents::TEvSchemaCreated::TPtr& ev); + void Handle(TEvPrivate::TEvCreateSessionResult::TPtr& ev); + void Handle(TEvPrivate::TEvCreateSemaphoreResult::TPtr& ev); + void Handle(TEvPrivate::TEvAcquireSemaphoreResult::TPtr& ev); + void Handle(TEvPrivate::TEvSessionStopped::TPtr& ev); + void Handle(TEvPrivate::TEvTimeout::TPtr&); + void Handle(TEvPrivate::TEvDescribeSemaphoreResult::TPtr& ev); + void Handle(TEvPrivate::TEvOnChangedResult::TPtr& ev); + void HandleException(const std::exception& e); + + STRICT_STFUNC_EXC(StateFunc, + hFunc(NFq::TEvents::TEvSchemaCreated, Handle); + hFunc(TEvPrivate::TEvCreateSessionResult, Handle); + hFunc(TEvPrivate::TEvCreateSemaphoreResult, Handle); + hFunc(TEvPrivate::TEvAcquireSemaphoreResult, Handle); + hFunc(TEvPrivate::TEvOnChangedResult, Handle); + hFunc(TEvPrivate::TEvSessionStopped, Handle); + hFunc(TEvPrivate::TEvTimeout, Handle); + 
hFunc(TEvPrivate::TEvDescribeSemaphoreResult, Handle); + cFunc(NActors::TEvents::TSystem::Poison, PassAway);, + ExceptionFunc(std::exception, HandleException) + ) + +private: + void CreateSemaphore(); + void AcquireSemaphore(); + void DebugPrint(); + void StartSession(); + void DescribeSemaphore(); + void ProcessState(); + void ResetState(); + void SetTimeout(); +}; + +TLeaderElection::TLeaderElection( + NActors::TActorId parentId, + NActors::TActorId coordinatorId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters) + : Config(config) + , CredentialsProviderFactory(credentialsProviderFactory) + , YqSharedResources(yqSharedResources) + , YdbConnection(NewYdbConnection(config.GetDatabase(), credentialsProviderFactory, yqSharedResources->UserSpaceYdbDriver)) + , TablePathPrefix(JoinPath(config.GetDatabase().GetDatabase(), config.GetCoordinationNodePath())) + , CoordinationNodePath(JoinPath(TablePathPrefix, tenant)) + , ParentId(parentId) + , CoordinatorId(coordinatorId) + , Tenant(tenant) + , Metrics(counters) { +} + +ERetryErrorClass RetryFunc(const NYdb::TStatus& status) { + if (status.IsSuccess()) { + return ERetryErrorClass::NoRetry; + } + + if (status.IsTransportError()) { + return ERetryErrorClass::ShortRetry; + } + + const NYdb::EStatus st = status.GetStatus(); + if (st == NYdb::EStatus::INTERNAL_ERROR || st == NYdb::EStatus::UNAVAILABLE || + st == NYdb::EStatus::TIMEOUT || st == NYdb::EStatus::BAD_SESSION || + st == NYdb::EStatus::SESSION_EXPIRED || + st == NYdb::EStatus::SESSION_BUSY) { + return ERetryErrorClass::ShortRetry; + } + + if (st == NYdb::EStatus::OVERLOADED) { + return ERetryErrorClass::LongRetry; + } + + return ERetryErrorClass::NoRetry; +} + +TYdbSdkRetryPolicy::TPtr MakeSchemaRetryPolicy() { + static auto policy = 
TYdbSdkRetryPolicy::GetExponentialBackoffPolicy(RetryFunc, TDuration::MilliSeconds(10), TDuration::Seconds(1), TDuration::Seconds(5)); + return policy; +} + +void TLeaderElection::Bootstrap() { + Become(&TLeaderElection::StateFunc); + LogPrefix = "TLeaderElection " + SelfId().ToString() + " "; + LOG_ROW_DISPATCHER_DEBUG("Successfully bootstrapped, local coordinator id " << CoordinatorId.ToString()); + ProcessState(); +} + +void TLeaderElection::ProcessState() { + switch (State) { + case EState::Init: + if (!CoordinationNodeCreated) { + Register(MakeCreateCoordinationNodeActor( + SelfId(), + NKikimrServices::FQ_ROW_DISPATCHER, + YdbConnection, + CoordinationNodePath, + MakeSchemaRetryPolicy())); + } + State = EState::WaitNodeCreated; + [[fallthrough]]; + case EState::WaitNodeCreated: + if (!CoordinationNodeCreated) { + return; + } + if (!Session) { + StartSession(); + } + State = EState::WaitSessionCreated; + [[fallthrough]]; + case EState::WaitSessionCreated: + if (!Session) { + return; + } + if (!SemaphoreCreated) { + CreateSemaphore(); + } + State = EState::WaitSemaphoreCreated; + [[fallthrough]]; + case EState::WaitSemaphoreCreated: + if (!SemaphoreCreated) { + return; + } + State = EState::Started; + [[fallthrough]]; + case EState::Started: + AcquireSemaphore(); + DescribeSemaphore(); + break; + } +} + +void TLeaderElection::ResetState() { + State = EState::Init; + SetTimeout(); +} + +void TLeaderElection::CreateSemaphore() { + Session->CreateSemaphore(SemaphoreName, 1 /* limit */) + .Subscribe( + [actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()](const NYdb::NCoordination::TAsyncResult& future) { + actorSystem->Send(actorId, new TEvPrivate::TEvCreateSemaphoreResult(future)); + }); +} + +void TLeaderElection::AcquireSemaphore() { + if (PendingAcquire) { + return; + } + LOG_ROW_DISPATCHER_DEBUG("Try to acquire semaphore"); + + NActorsProto::TActorId protoId; + ActorIdToProto(CoordinatorId, &protoId); + TString strActorId; + if 
(!protoId.SerializeToString(&strActorId)) { + Y_ABORT("SerializeToString"); + } + PendingAcquire = true; + Session->AcquireSemaphore( + SemaphoreName, + NYdb::NCoordination::TAcquireSemaphoreSettings().Count(1).Data(strActorId)) + .Subscribe( + [actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()](const NYdb::NCoordination::TAsyncResult& future) { + actorSystem->Send(actorId, new TEvPrivate::TEvAcquireSemaphoreResult(future)); + }); +} + +void TLeaderElection::StartSession() { + LOG_ROW_DISPATCHER_DEBUG("Start session"); + + YdbConnection->CoordinationClient + .StartSession( + CoordinationNodePath, + NYdb::NCoordination::TSessionSettings().OnStopped( + [actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()]() { + actorSystem->Send(actorId, new TEvPrivate::TEvSessionStopped()); + })) + .Subscribe([actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()](const NYdb::NCoordination::TAsyncSessionResult& future) { + actorSystem->Send(actorId, new TEvPrivate::TEvCreateSessionResult(future)); + }); +} + +void TLeaderElection::Handle(NFq::TEvents::TEvSchemaCreated::TPtr& ev) { + if (!IsTableCreated(ev->Get()->Result)) { + LOG_ROW_DISPATCHER_ERROR("Schema creation error " << ev->Get()->Result.GetIssues()); + Metrics.Errors->Inc(); + ResetState(); + return; + } + LOG_ROW_DISPATCHER_DEBUG("Coordination node successfully created"); + CoordinationNodeCreated = true; + ProcessState(); +} + +void TLeaderElection::Handle(TEvPrivate::TEvCreateSessionResult::TPtr& ev) { + auto result = ev->Get()->Result.GetValue(); + if (!result.IsSuccess()) { + LOG_ROW_DISPATCHER_ERROR("CreateSession failed, " << result.GetIssues()); + Metrics.Errors->Inc(); + ResetState(); + return; + } + Session = result.GetResult(); + LOG_ROW_DISPATCHER_DEBUG("Session successfully created"); + ProcessState(); +} + +void TLeaderElection::Handle(TEvPrivate::TEvCreateSemaphoreResult::TPtr& ev) { + auto result = ev->Get()->Result.GetValue(); + if 
(!IsTableCreated(result)) { + LOG_ROW_DISPATCHER_ERROR("Semaphore creating error " << result.GetIssues()); + Metrics.Errors->Inc(); + ResetState(); + return; + } + SemaphoreCreated = true; + LOG_ROW_DISPATCHER_DEBUG("Semaphore successfully created"); + ProcessState(); +} + +void TLeaderElection::Handle(TEvPrivate::TEvAcquireSemaphoreResult::TPtr& ev) { + auto result = ev->Get()->Result.GetValue(); + PendingAcquire = false; + + if (!result.IsSuccess()) { + LOG_ROW_DISPATCHER_ERROR("Failed to acquire semaphore, " << result.GetIssues()); + Metrics.Errors->Inc(); + ResetState(); + return; + } + LOG_ROW_DISPATCHER_DEBUG("Semaphore successfully acquired"); +} + +void TLeaderElection::PassAway() { + LOG_ROW_DISPATCHER_DEBUG("PassAway"); + TActorBootstrapped::PassAway(); +} + +void TLeaderElection::Handle(TEvPrivate::TEvSessionStopped::TPtr&) { + LOG_ROW_DISPATCHER_DEBUG("TEvSessionStopped"); + Session.Clear(); + PendingAcquire = false; + PendingDescribe = false; + ResetState(); +} + +void TLeaderElection::SetTimeout() { + if (TimeoutScheduled) { + return; + } + TimeoutScheduled = true; + Schedule(TDuration::Seconds(TimeoutDurationSec), new TEvPrivate::TEvTimeout()); +} + +void TLeaderElection::Handle(TEvPrivate::TEvTimeout::TPtr&) { + TimeoutScheduled = false; + LOG_ROW_DISPATCHER_DEBUG("TEvTimeout"); + ProcessState(); +} + +void TLeaderElection::DescribeSemaphore() { + if (PendingDescribe) { + return; + } + LOG_ROW_DISPATCHER_DEBUG("Describe semaphore"); + PendingDescribe = true; + Session->DescribeSemaphore( + SemaphoreName, + NYdb::NCoordination::TDescribeSemaphoreSettings() + .WatchData() + .WatchOwners() + .IncludeOwners() + .OnChanged([actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()](bool isChanged) { + actorSystem->Send(actorId, new TEvPrivate::TEvOnChangedResult(isChanged)); + })) + .Subscribe( + [actorId = this->SelfId(), actorSystem = TActivationContext::ActorSystem()](const NYdb::NCoordination::TAsyncDescribeSemaphoreResult& future) { 
+ actorSystem->Send(actorId, new TEvPrivate::TEvDescribeSemaphoreResult(future)); + }); +} + +void TLeaderElection::Handle(TEvPrivate::TEvOnChangedResult::TPtr& /*ev*/) { + LOG_ROW_DISPATCHER_DEBUG("Semaphore changed"); + PendingDescribe = false; + ProcessState(); +} + +void TLeaderElection::Handle(TEvPrivate::TEvDescribeSemaphoreResult::TPtr& ev) { + PendingDescribe = false; + auto result = ev->Get()->Result.GetValue(); + if (!result.IsSuccess()) { + LOG_ROW_DISPATCHER_ERROR("Semaphore describe fail, " << result.GetIssues()); + Metrics.Errors->Inc(); + ResetState(); + return; + } + + const NYdb::NCoordination::TSemaphoreDescription& description = result.GetResult(); + Y_ABORT_UNLESS(description.GetOwners().size() <= 1, "To many owners"); + if (description.GetOwners().empty()) { + LOG_ROW_DISPATCHER_DEBUG("Empty owners"); + // Wait OnChanged. + return; + } + TString data = description.GetOwners()[0].GetData(); + NActorsProto::TActorId protoId; + if (!protoId.ParseFromString(data)) { + Y_ABORT("ParseFromString"); + } + + NActors::TActorId id = ActorIdFromProto(protoId); + LOG_ROW_DISPATCHER_DEBUG("Semaphore successfully described: coordinator id " << id); + if (!LeaderActorId || (*LeaderActorId != id)) { + LOG_ROW_DISPATCHER_INFO("Send TEvCoordinatorChanged to " << ParentId); + TActivationContext::ActorSystem()->Send(ParentId, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(id)); + Metrics.LeaderChangedCount->Inc(); + } + LeaderActorId = id; +} + +void TLeaderElection::HandleException(const std::exception& e) { + LOG_ROW_DISPATCHER_ERROR("Internal error: exception:" << e.what()); + Metrics.Errors->Inc(); + ResetState(); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewLeaderElection( + NActors::TActorId rowDispatcherId, + NActors::TActorId coordinatorId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + 
const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters) +{ + return std::unique_ptr(new TLeaderElection(rowDispatcherId, coordinatorId, config, credentialsProviderFactory, yqSharedResources, tenant, counters)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/leader_election.h b/ydb/core/fq/libs/row_dispatcher/leader_election.h new file mode 100644 index 000000000000..536eecbe65aa --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/leader_election.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include +#include + +namespace NFq { + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewLeaderElection( + NActors::TActorId rowDispatcherId, + NActors::TActorId coordinatorId, + const NConfig::TRowDispatcherCoordinatorConfig& config, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/protos/events.proto b/ydb/core/fq/libs/row_dispatcher/protos/events.proto new file mode 100644 index 000000000000..e832c26c2752 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/protos/events.proto @@ -0,0 +1,78 @@ +syntax = "proto3"; + +package NFq.NRowDispatcherProto; +option cc_enable_arenas = true; + +import "ydb/library/actors/protos/actors.proto"; +import "ydb/library/yql/providers/pq/proto/dq_io.proto"; +import "ydb/library/yql/dq/actors/protos/dq_events.proto"; + +message TEvGetAddressRequest { + NYql.NPq.NProto.TDqPqTopicSource Source = 1; + repeated uint32 PartitionId = 2; +} + +message TEvPartitionAddress { + repeated uint32 PartitionId = 1; + NActorsProto.TActorId ActorId = 2; +} + +message TEvGetAddressResponse { + repeated TEvPartitionAddress Partitions = 1; +} + +message TEvStartSession { + 
NYql.NPq.NProto.TDqPqTopicSource Source = 1; + uint32 PartitionId = 2; + string Token = 3; + optional uint64 Offset = 4; + uint64 StartingMessageTimestampMs = 5; + string QueryId = 6; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvStartSessionAck { + TEvStartSession Consumer = 1; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvGetNextBatch { + uint32 PartitionId = 1; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvNewDataArrived { + uint32 PartitionId = 1; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvStopSession { + NYql.NPq.NProto.TDqPqTopicSource Source = 1; + uint32 PartitionId = 2; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvMessage { + string Json = 1; + uint64 Offset = 2; +} + +message TEvMessageBatch { + repeated TEvMessage Messages = 1; + uint32 PartitionId = 2; + uint64 NextMessageOffset = 3; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvStatus { + uint32 PartitionId = 1; + uint64 NextMessageOffset = 2; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + +message TEvSessionError { + string Message = 1; + uint32 PartitionId = 2; + optional NYql.NDqProto.TMessageTransportMeta TransportMeta = 100; +} + diff --git a/ydb/core/fq/libs/row_dispatcher/protos/ya.make b/ydb/core/fq/libs/row_dispatcher/protos/ya.make new file mode 100644 index 000000000000..c2d06e232661 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/protos/ya.make @@ -0,0 +1,15 @@ +PROTO_LIBRARY() + +SRCS( + events.proto +) + +PEERDIR( + ydb/library/actors/protos + ydb/library/yql/dq/actors/protos + ydb/library/yql/providers/pq/proto +) + +EXCLUDE_TAGS(GO_PROTO) + +END() diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp new file mode 100644 index 000000000000..3d327385cf0c --- 
/dev/null +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp @@ -0,0 +1,608 @@ +#include "row_dispatcher.h" +#include "coordinator.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + + +namespace NFq { + +using namespace NActors; + +namespace { + +const ui64 CoordinatorPingPeriodSec = 2; + +//////////////////////////////////////////////////////////////////////////////// + +struct TRowDispatcherMetrics { + explicit TRowDispatcherMetrics(const ::NMonitoring::TDynamicCounterPtr& counters) + : Counters(counters) { + ErrorsCount = Counters->GetCounter("ErrorsCount"); + ClientsCount = Counters->GetCounter("ClientsCount"); + RowsSent = Counters->GetCounter("RowsSent", true); + } + + ::NMonitoring::TDynamicCounterPtr Counters; + ::NMonitoring::TDynamicCounters::TCounterPtr ErrorsCount; + ::NMonitoring::TDynamicCounters::TCounterPtr ClientsCount; + ::NMonitoring::TDynamicCounters::TCounterPtr RowsSent; +}; + + +struct TEvPrivate { + // Event ids + enum EEv : ui32 { + EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), + EvCoordinatorPing = EvBegin + 20, + EvPrintState, + EvEnd + }; + + static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); + struct TEvCoordinatorPing : NActors::TEventLocal {}; + struct TEvPrintState : public NActors::TEventLocal {}; +}; + +ui64 PrintStatePeriodSec = 60; + +class TRowDispatcher : public TActorBootstrapped { + + struct ConsumerSessionKey { + TActorId ReadActorId; + ui32 PartitionId; + + size_t Hash() const noexcept { + ui64 hash = std::hash()(ReadActorId); + hash = CombineHashes(hash, std::hash()(PartitionId)); + return hash; + } + bool operator==(const ConsumerSessionKey& other) const { + return ReadActorId == other.ReadActorId && PartitionId == other.PartitionId; + } + }; + + struct ConsumerSessionKeyHash { + int operator()(const ConsumerSessionKey& k) const { + return 
k.Hash(); + } + }; + + struct TopicSessionKey { + TString Endpoint; + TString Database; + TString TopicName; + ui64 PartitionId; + + size_t Hash() const noexcept { + ui64 hash = std::hash()(Endpoint); + hash = CombineHashes(hash, std::hash()(Database)); + hash = CombineHashes(hash, std::hash()(TopicName)); + hash = CombineHashes(hash, std::hash()(PartitionId)); + return hash; + } + bool operator==(const TopicSessionKey& other) const { + return Endpoint == other.Endpoint && Database == other.Database + && TopicName == other.TopicName && PartitionId == other.PartitionId; + } + }; + + struct TopicSessionKeyHash { + int operator()(const TopicSessionKey& k) const { + return k.Hash(); + } + }; + + + NConfig::TRowDispatcherConfig Config; + NConfig::TCommonConfig CommonConfig; + NKikimr::TYdbCredentialsProviderFactory CredentialsProviderFactory; + TYqSharedResources::TPtr YqSharedResources; + TMaybe CoordinatorActorId; + TSet CoordinatorChangedSubscribers; + NYql::ISecuredServiceAccountCredentialsFactory::TPtr CredentialsFactory; + const TString LogPrefix; + ui64 NextEventQueueId = 0; + TString Tenant; + NFq::NRowDispatcher::IActorFactory::TPtr ActorFactory; + const ::NMonitoring::TDynamicCounterPtr Counters; + TRowDispatcherMetrics Metrics; + + struct ConsumerCounters { + ui64 NewDataArrived = 0; + ui64 GetNextBatch = 0; + ui64 MessageBatch = 0; + }; + + struct ConsumerInfo { + ConsumerInfo( + NActors::TActorId readActorId, + NActors::TActorId selfId, + ui64 eventQueueId, + NFq::NRowDispatcherProto::TEvStartSession& proto, + TActorId topicSessionId) + : ReadActorId(readActorId) + , SourceParams(proto.GetSource()) + , PartitionId(proto.GetPartitionId()) + , EventQueueId(eventQueueId) + , Proto(proto) + , TopicSessionId(topicSessionId) + , QueryId(proto.GetQueryId()) { + EventsQueue.Init("txId", selfId, selfId, eventQueueId, /* KeepAlive */ true); + EventsQueue.OnNewRecipientId(readActorId); + } + + NActors::TActorId ReadActorId; + NYql::NPq::NProto::TDqPqTopicSource 
SourceParams; + ui64 PartitionId; + NYql::NDq::TRetryEventsQueue EventsQueue; + ui64 EventQueueId; + NFq::NRowDispatcherProto::TEvStartSession Proto; + TActorId TopicSessionId; + const TString QueryId; + ConsumerCounters Counters; + }; + + struct SessionInfo { + TMap> Consumers; // key - ReadActor actor id + }; + + struct TopicSessionInfo { + TMap Sessions; // key - TopicSession actor id + }; + + THashMap, ConsumerSessionKeyHash> Consumers; + TMap> ConsumersByEventQueueId; + THashMap TopicSessions; + +public: + explicit TRowDispatcher( + const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, + const ::NMonitoring::TDynamicCounterPtr& counters); + + void Bootstrap(); + + static constexpr char ActorName[] = "FQ_ROW_DISPATCHER"; + + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev); + void HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev); + void HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev); + + void Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) ; + void Handle(TEvPrivate::TEvCoordinatorPing::TPtr& ev); + void Handle(NActors::TEvents::TEvPong::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvGetNextBatch::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev); + + void 
Handle(NActors::TEvents::TEvPing::TPtr& ev); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr&); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr&); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr&); + void Handle(NFq::TEvPrivate::TEvPrintState::TPtr&); + + void DeleteConsumer(const ConsumerSessionKey& key); + void PrintInternalState(); + + STRICT_STFUNC( + StateFunc, { + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChanged, Handle); + hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected); + hFunc(NActors::TEvents::TEvUndelivered, Handle); + hFunc(TEvPrivate::TEvCoordinatorPing, Handle) + hFunc(NActors::TEvents::TEvPong, Handle); + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe, Handle); + hFunc(NFq::TEvRowDispatcher::TEvGetNextBatch, Handle); + hFunc(NFq::TEvRowDispatcher::TEvMessageBatch, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStartSession, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStopSession, Handle); + hFunc(NFq::TEvRowDispatcher::TEvSessionError, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStatus, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvPing, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle); + hFunc(NActors::TEvents::TEvPing, Handle); + hFunc(NFq::TEvRowDispatcher::TEvNewDataArrived, Handle); + hFunc(NFq::TEvPrivate::TEvPrintState, Handle); + }) +}; + +TRowDispatcher::TRowDispatcher( + const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, + const ::NMonitoring::TDynamicCounterPtr& 
counters) + : Config(config) + , CommonConfig(commonConfig) + , CredentialsProviderFactory(credentialsProviderFactory) + , YqSharedResources(yqSharedResources) + , CredentialsFactory(credentialsFactory) + , LogPrefix("RowDispatcher: ") + , Tenant(tenant) + , ActorFactory(actorFactory) + , Counters(counters) + , Metrics(counters) { +} + +void TRowDispatcher::Bootstrap() { + Become(&TRowDispatcher::StateFunc); + LOG_ROW_DISPATCHER_DEBUG("Successfully bootstrapped row dispatcher, id " << SelfId() << ", tenant " << Tenant); + + const auto& config = Config.GetCoordinator(); + auto coordinatorId = Register(NewCoordinator(SelfId(), config, YqSharedResources, Tenant, Counters).release()); + Register(NewLeaderElection(SelfId(), coordinatorId, config, CredentialsProviderFactory, YqSharedResources, Tenant, Counters).release()); + Schedule(TDuration::Seconds(CoordinatorPingPeriodSec), new TEvPrivate::TEvCoordinatorPing()); + Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("Coordinator changed, old leader " << CoordinatorActorId << ", new " << ev->Get()->CoordinatorActorId); + + CoordinatorActorId = ev->Get()->CoordinatorActorId; + Send(*CoordinatorActorId, new NActors::TEvents::TEvPing(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); + for (auto actorId : CoordinatorChangedSubscribers) { + Send( + actorId, + new NFq::TEvRowDispatcher::TEvCoordinatorChanged(ev->Get()->CoordinatorActorId), + IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); + } +} + +void TRowDispatcher::HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("EvNodeConnected, node id " << ev->Get()->NodeId); + for (auto& [actorId, consumer] : Consumers) { + consumer->EventsQueue.HandleNodeConnected(ev->Get()->NodeId); + } +} + +void 
TRowDispatcher::HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvNodeDisconnected, node id " << ev->Get()->NodeId); + for (auto& [actorId, consumer] : Consumers) { + consumer->EventsQueue.HandleNodeDisconnected(ev->Get()->NodeId); + } +} + +void TRowDispatcher::Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvUndelivered, ev: " << ev->Get()->ToString() << ", reason " << ev->Get()->Reason); + for (auto& [actorId, consumer] : Consumers) { + consumer->EventsQueue.HandleUndelivered(ev); + } +} + +void TRowDispatcher::Handle(TEvPrivate::TEvCoordinatorPing::TPtr&) { + Schedule(TDuration::Seconds(CoordinatorPingPeriodSec), new TEvPrivate::TEvCoordinatorPing()); + if (!CoordinatorActorId) { + return; + } + LOG_ROW_DISPATCHER_DEBUG("Send ping to " << *CoordinatorActorId); + Send(*CoordinatorActorId, new NActors::TEvents::TEvPing(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); +} + +void TRowDispatcher::Handle(NActors::TEvents::TEvPong::TPtr&) { + LOG_ROW_DISPATCHER_TRACE("NActors::TEvents::TEvPong "); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvCoordinatorChangesSubscribe from " << ev->Sender); + CoordinatorChangedSubscribers.insert(ev->Sender); + if (!CoordinatorActorId) { + return; + } + Send(ev->Sender, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(*CoordinatorActorId), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); +} + +void TRowDispatcher::PrintInternalState() { + if (Consumers.empty()) { + return; + } + TStringStream str; + str << "Consumers:\n"; + for (auto& [key, consumerInfo] : Consumers) { + str << " query id " << consumerInfo->QueryId << ", partId: " << key.PartitionId << ", read actor id: " << key.ReadActorId + << ", queueId " << consumerInfo->EventQueueId << ", get next " << consumerInfo->Counters.GetNextBatch + << ", data arrived " 
<< consumerInfo->Counters.NewDataArrived << ", message batch " << consumerInfo->Counters.MessageBatch << "\n"; + str << " "; + consumerInfo->EventsQueue.PrintInternalState(str); + } + + str << "\nSessions:\n"; + for (auto& [key, sessionInfo1] : TopicSessions) { + str << " " << key.Endpoint << " / " << key.Database << " / " << key.TopicName << ", id: " << key.PartitionId << "\n"; + for (auto& [actorId, sessionInfo2] : sessionInfo1.Sessions) { + str << " session id: " << actorId << "\n"; + for (auto& [actorId2, consumer] : sessionInfo2.Consumers) { + str << " read actor id: " << actorId2 << "\n"; + } + } + } + LOG_ROW_DISPATCHER_DEBUG(str.Str()); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvStartSession from " << ev->Sender << ", topicPath " << ev->Get()->Record.GetSource().GetTopicPath() << + " partitionId " << ev->Get()->Record.GetPartitionId()); + + TMaybe readOffset; + if (ev->Get()->Record.HasOffset()) { + readOffset = ev->Get()->Record.GetOffset(); + } + + ConsumerSessionKey key{ev->Sender, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it != Consumers.end()) { + LOG_ROW_DISPATCHER_ERROR("Сonsumer already exists, ignore StartSession"); + return; + } + const auto& source = ev->Get()->Record.GetSource(); + + TActorId sessionActorId; + TopicSessionKey topicKey{source.GetEndpoint(), source.GetDatabase(), source.GetTopicPath(), ev->Get()->Record.GetPartitionId()}; + TopicSessionInfo& topicSessionInfo = TopicSessions[topicKey]; + LOG_ROW_DISPATCHER_DEBUG("Topic session count " << topicSessionInfo.Sessions.size()); + Y_ENSURE(topicSessionInfo.Sessions.size() <= 1); + + auto consumerInfo = MakeAtomicShared(ev->Sender, SelfId(), NextEventQueueId++, ev->Get()->Record, TActorId()); + Consumers[key] = consumerInfo; + ConsumersByEventQueueId[consumerInfo->EventQueueId] = consumerInfo; + if (!consumerInfo->EventsQueue.OnEventReceived(ev)) { + const 
NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + const ui64 seqNo = meta.GetSeqNo(); + LOG_ROW_DISPATCHER_ERROR("TEvStartSession: wrong seq num from " << ev->Sender.ToString() << ", seqNo " << seqNo << ", ignore message"); + } + + if (topicSessionInfo.Sessions.empty()) { + LOG_ROW_DISPATCHER_DEBUG("Create new session " << readOffset); + sessionActorId = ActorFactory->RegisterTopicSession( + source.GetTopicPath(), + Config, + SelfId(), + ev->Get()->Record.GetPartitionId(), + YqSharedResources->UserSpaceYdbDriver, + CreateCredentialsProviderFactoryForStructuredToken( + CredentialsFactory, + ev->Get()->Record.GetToken(), + source.GetAddBearerToToken()), + Counters); + SessionInfo& sessionInfo = topicSessionInfo.Sessions[sessionActorId]; + sessionInfo.Consumers[ev->Sender] = consumerInfo; + } else { + auto sessionIt = topicSessionInfo.Sessions.begin(); + SessionInfo& sessionInfo = sessionIt->second; + sessionInfo.Consumers[ev->Sender] = consumerInfo; + sessionActorId = sessionIt->first; + } + consumerInfo->TopicSessionId = sessionActorId; + consumerInfo->EventsQueue.Send(new NFq::TEvRowDispatcher::TEvStartSessionAck(consumerInfo->Proto)); + + Forward(ev, sessionActorId); + Metrics.ClientsCount->Set(Consumers.size()); + PrintInternalState(); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvGetNextBatch::TPtr& ev) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + LOG_ROW_DISPATCHER_TRACE("TEvGetNextBatch from " << ev->Sender << ", partId " << ev->Get()->Record.GetPartitionId() << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + + ConsumerSessionKey key{ev->Sender, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore TEvGetNextBatch, no such session"); + return; + } + if (!it->second->EventsQueue.OnEventReceived(ev)) { + const 
NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + const ui64 seqNo = meta.GetSeqNo(); + LOG_ROW_DISPATCHER_ERROR("TEvGetNextBatch: wrong seq num from " << ev->Sender.ToString() << ", seqNo " << seqNo << ", ignore message"); + return; + } + it->second->Counters.GetNextBatch++; + Forward(ev, it->second->TopicSessionId); +} + +void TRowDispatcher::Handle(NActors::TEvents::TEvPing::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvPing from " << ev->Sender); + Send(ev->Sender, new NActors::TEvents::TEvPong()); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvStopSession, topicPath " << ev->Get()->Record.GetSource().GetTopicPath() << + " partitionId " << ev->Get()->Record.GetPartitionId()); + + ConsumerSessionKey key{ev->Sender, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Wrong consumer, sender " << ev->Sender << ", part id " << ev->Get()->Record.GetPartitionId()); + return; + } + if (!it->second->EventsQueue.OnEventReceived(ev)) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + const ui64 seqNo = meta.GetSeqNo(); + + LOG_ROW_DISPATCHER_ERROR("TEvStopSession: wrong seq num from " << ev->Sender.ToString() << ", seqNo " << seqNo << ", ignore message"); + return; + } + DeleteConsumer(key); +} + +void TRowDispatcher::DeleteConsumer(const ConsumerSessionKey& key) { + LOG_ROW_DISPATCHER_DEBUG("DeleteConsumer, readActorId " << key.ReadActorId << + " partitionId " << key.PartitionId); + + auto consumerIt = Consumers.find(key); + if (consumerIt == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore DeleteConsumer, no such session"); + return; + } + const auto& consumer = consumerIt->second; + auto event = std::make_unique(); + *event->Record.MutableSource() = consumer->SourceParams; + event->Record.SetPartitionId(consumer->PartitionId); + Send(new 
IEventHandle(consumerIt->second->TopicSessionId, consumer->ReadActorId, event.release(), 0)); + + TopicSessionKey topicKey{ + consumer->SourceParams.GetEndpoint(), + consumer->SourceParams.GetDatabase(), + consumer->SourceParams.GetTopicPath(), + consumer->PartitionId}; + TopicSessionInfo& topicSessionInfo = TopicSessions[topicKey]; + SessionInfo& sessionInfo = topicSessionInfo.Sessions[consumerIt->second->TopicSessionId]; + Y_ENSURE(sessionInfo.Consumers.count(consumer->ReadActorId)); + sessionInfo.Consumers.erase(consumer->ReadActorId); + if (sessionInfo.Consumers.empty()) { + LOG_ROW_DISPATCHER_DEBUG("Session is not used, sent TEvPoisonPill"); + topicSessionInfo.Sessions.erase(consumerIt->second->TopicSessionId); + Send(consumerIt->second->TopicSessionId, new NActors::TEvents::TEvPoisonPill()); + if (topicSessionInfo.Sessions.empty()) { + TopicSessions.erase(topicKey); + } + } + ConsumersByEventQueueId.erase(consumerIt->second->EventQueueId); + Consumers.erase(consumerIt); + Metrics.ClientsCount->Set(Consumers.size()); + PrintInternalState(); +} + +void TRowDispatcher::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr& ev) { + LOG_ROW_DISPATCHER_WARN("Session closed, event queue id " << ev->Get()->EventQueueId); + for (auto& [consumerKey, consumer] : Consumers) { + if (consumer->EventQueueId != ev->Get()->EventQueueId) { + continue; + } + DeleteConsumer(consumerKey); + break; + } +} + +void TRowDispatcher::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvRetry " << ev->Get()->EventQueueId); + auto it = ConsumersByEventQueueId.find(ev->Get()->EventQueueId); + if (it == ConsumersByEventQueueId.end()) { + LOG_ROW_DISPATCHER_WARN("No consumer with EventQueueId = " << ev->Get()->EventQueueId); + return; + } + it->second->EventsQueue.Retry(); +} + +void TRowDispatcher::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvRetryQueuePrivate::TEvPing " << 
ev->Get()->EventQueueId); + auto it = ConsumersByEventQueueId.find(ev->Get()->EventQueueId); + if (it == ConsumersByEventQueueId.end()) { + LOG_ROW_DISPATCHER_WARN("No consumer with EventQueueId = " << ev->Get()->EventQueueId); + return; + } + it->second->EventsQueue.Ping(); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvNewDataArrived from " << ev->Sender); + ConsumerSessionKey key{ev->Get()->ReadActorId, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore TEvNewDataArrived, no such session"); + return; + } + LOG_ROW_DISPATCHER_TRACE("Forward TEvNewDataArrived to " << ev->Get()->ReadActorId); + it->second->Counters.NewDataArrived++; + it->second->EventsQueue.Send(ev.Release()->Release().Release()); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvMessageBatch from " << ev->Sender); + ConsumerSessionKey key{ev->Get()->ReadActorId, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore MessageBatch, no such session"); + return; + } + Metrics.RowsSent->Add(ev->Get()->Record.MessagesSize()); + LOG_ROW_DISPATCHER_TRACE("Forward TEvMessageBatch to " << ev->Get()->ReadActorId); + it->second->Counters.MessageBatch++; + it->second->EventsQueue.Send(ev.Release()->Release().Release()); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvSessionError from " << ev->Sender); + ConsumerSessionKey key{ev->Get()->ReadActorId, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore MessageBatch, no such session"); + return; + } + Metrics.ErrorsCount->Inc(); + LOG_ROW_DISPATCHER_TRACE("Forward TEvSessionError to " << ev->Get()->ReadActorId); + 
it->second->EventsQueue.Send(ev.Release()->Release().Release()); + DeleteConsumer(key); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvStatus from " << ev->Sender); + ConsumerSessionKey key{ev->Get()->ReadActorId, ev->Get()->Record.GetPartitionId()}; + auto it = Consumers.find(key); + if (it == Consumers.end()) { + LOG_ROW_DISPATCHER_WARN("Ignore TEvStatus, no such session"); + return; + } + LOG_ROW_DISPATCHER_TRACE("Forward TEvStatus to " << ev->Get()->ReadActorId); + it->second->EventsQueue.Send(ev.Release()->Release().Release()); +} + +void TRowDispatcher::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { + Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); + PrintInternalState(); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewRowDispatcher( + const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, + const ::NMonitoring::TDynamicCounterPtr& counters) +{ + return std::unique_ptr(new TRowDispatcher( + config, + commonConfig, + credentialsProviderFactory, + yqSharedResources, + credentialsFactory, + tenant, + actorFactory, + counters)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h new file mode 100644 index 000000000000..54c3b1521afd --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace NFq { + +std::unique_ptr NewRowDispatcher( + 
const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, + const ::NMonitoring::TDynamicCounterPtr& counters); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp new file mode 100644 index 000000000000..1300f419d7de --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp @@ -0,0 +1,32 @@ +#include "row_dispatcher_service.h" +#include "actors_factory.h" + +#include "row_dispatcher.h" + +namespace NFq { + +using namespace NActors; + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewRowDispatcherService( + const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters) +{ + return NewRowDispatcher( + config, + commonConfig, + credentialsProviderFactory, + yqSharedResources, + credentialsFactory, + tenant, + NFq::NRowDispatcher::CreateActorFactory(), + counters); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h new file mode 100644 index 000000000000..ef8a9f29099d --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include "events/data_plane.h" + 
+#include + +#include + +namespace NFq { + +std::unique_ptr NewRowDispatcherService( + const NConfig::TRowDispatcherConfig& config, + const NConfig::TCommonConfig& commonConfig, + const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, + const TYqSharedResources::TPtr& yqSharedResources, + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, + const TString& tenant, + const ::NMonitoring::TDynamicCounterPtr& counters); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp new file mode 100644 index 000000000000..9623806ee87a --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -0,0 +1,777 @@ +#include "topic_session.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace NFq { + +using namespace NActors; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +struct TTopicSessionMetrics { + void Init(const ::NMonitoring::TDynamicCounterPtr& counters, NActors::TActorId selfId) { + SelfId = selfId; + SubGroup = counters->GetSubgroup("actor_id", SelfId.ToString()); + InFlyAsyncInputData = SubGroup->GetCounter("InFlyAsyncInputData"); + RowsRead = SubGroup->GetCounter("RowsRead", true); + InFlySubscribe = SubGroup->GetCounter("InFlySubscribe"); + } + + ~TTopicSessionMetrics() { + SubGroup->RemoveSubgroup("actor_id", SelfId.ToString()); + } + NActors::TActorId SelfId; + ::NMonitoring::TDynamicCounterPtr SubGroup; + ::NMonitoring::TDynamicCounters::TCounterPtr InFlyAsyncInputData; + ::NMonitoring::TDynamicCounters::TCounterPtr RowsRead; + ::NMonitoring::TDynamicCounters::TCounterPtr InFlySubscribe; +}; + +struct TEvPrivate { + // Event ids + enum EEv : ui32 { + EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), + EvPqEventsReady = EvBegin + 10, + EvCreateSession, + EvStatus, + 
EvDataParsed, + EvDataAfterFilteration, + EvDataFiltered, + EvPrintState, + EvEnd + }; + static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); + + // Events + struct TEvPqEventsReady : public NActors::TEventLocal {}; + struct TEvCreateSession : public NActors::TEventLocal {}; + struct TEvPrintState : public NActors::TEventLocal {}; + struct TEvStatus : public NActors::TEventLocal {}; + struct TEvDataParsed : public NActors::TEventLocal { + TEvDataParsed(ui64 offset, TList&& value) + : Offset(offset) + , Value(std::move(value)) + {} + ui64 Offset = 0; + TList Value; + }; + + struct TEvDataFiltered : public NActors::TEventLocal { + TEvDataFiltered(ui64 offset) + : Offset(offset) + {} + ui64 Offset = 0; + }; + + struct TEvDataAfterFilteration : public NActors::TEventLocal { + TEvDataAfterFilteration(ui64 offset, const TString& json, TActorId readActorId) + : Offset(offset) + , Json(json) + , ReadActorId(readActorId) { } + ui64 Offset; + TString Json; + TActorId ReadActorId; + }; +}; + +ui64 PrintStatePeriodSec = 60; +ui64 MaxBatchSizeBytes = 10000000; + +TVector GetVector(const google::protobuf::RepeatedPtrField& value) { + return {value.begin(), value.end()}; +} + +class TTopicSession : public TActorBootstrapped { + +private: + using TParserInputType = std::pair< TVector, TVector>; // TODO: remove after YQ-3594 + + struct ClientsInfo { + ClientsInfo(const NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) + : Settings(ev->Get()->Record) + , ReadActorId(ev->Sender) + { + if (Settings.HasOffset()) { + NextMessageOffset = Settings.GetOffset(); + } + } + NFq::NRowDispatcherProto::TEvStartSession Settings; + NActors::TActorId ReadActorId; + std::unique_ptr Filter; // empty if no predicate + TQueue> Buffer; + ui64 UsedSize = 0; + bool DataArrivedSent = false; + TMaybe NextMessageOffset; + ui64 LastSendedNextMessageOffset = 0; + }; + + struct TTopicEventProcessor { + void 
operator()(NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent& event); + void operator()(NYdb::NTopic::TSessionClosedEvent& event); + void operator()(NYdb::NTopic::TReadSessionEvent::TStartPartitionSessionEvent& event); + void operator()(NYdb::NTopic::TReadSessionEvent::TStopPartitionSessionEvent& event); + void operator()(NYdb::NTopic::TReadSessionEvent::TEndPartitionSessionEvent& event); + void operator()(NYdb::NTopic::TReadSessionEvent::TPartitionSessionClosedEvent& event); + void operator()(NYdb::NTopic::TReadSessionEvent::TCommitOffsetAcknowledgementEvent&) {} + void operator()(NYdb::NTopic::TReadSessionEvent::TPartitionSessionStatusEvent&) { } + + TTopicSession& Self; + const TString& LogPrefix; + }; + + const TString TopicPath; + NActors::TActorId RowDispatcherActorId; + ui32 PartitionId; + NYdb::TDriver Driver; + std::shared_ptr CredentialsProviderFactory; + std::unique_ptr TopicClient; + std::shared_ptr ReadSession; + const i64 BufferSize; + TString LogPrefix; + NYql::NDq::TDqAsyncStats IngressStats; + ui64 LastMessageOffset = 0; + bool IsWaitingEvents = false; + THashMap Clients; + THashSet ClientsWithoutPredicate; + std::unique_ptr Parser; + NConfig::TRowDispatcherConfig Config; + ui64 UsedSize = 0; + TMaybe CurrentParserTypes; + const ::NMonitoring::TDynamicCounterPtr Counters; + TTopicSessionMetrics Metrics; + +public: + explicit TTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters); + + void Bootstrap(); + void PassAway() override; + + static constexpr char ActorName[] = "FQ_ROW_DISPATCHER_SESSION"; + +private: + NYdb::NTopic::TTopicClientSettings GetTopicClientSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const; + NYdb::NTopic::TTopicClient& GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams); 
+ NYdb::NTopic::TReadSessionSettings GetReadSessionSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const; + void CreateTopicSession(); + void CloseTopicSession(); + void SubscribeOnNextEvent(); + void SendToParsing(ui64 offset, const TString& message); + void SendData(ClientsInfo& info); + void InitParser(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams); + void FatalError(const TString& message, const std::unique_ptr* filter = nullptr); + void SendDataArrived(ClientsInfo& client); + void StopReadSession(); + TString GetSessionId() const; + void HandleNewEvents(); + TInstant GetMinStartingMessageTimestamp() const; + void AddDataToClient(ClientsInfo& client, ui64 offset, const TString& json); + + std::pair CreateItem(const NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage& message); + + void Handle(NFq::TEvPrivate::TEvPqEventsReady::TPtr&); + void Handle(NFq::TEvPrivate::TEvCreateSession::TPtr&); + void Handle(NFq::TEvPrivate::TEvDataParsed::TPtr&); + void Handle(NFq::TEvPrivate::TEvDataAfterFilteration::TPtr&); + void Handle(NFq::TEvPrivate::TEvStatus::TPtr&); + void Handle(NFq::TEvPrivate::TEvDataFiltered::TPtr&); + void Handle(NFq::TEvPrivate::TEvPrintState::TPtr&); + void Handle(TEvRowDispatcher::TEvGetNextBatch::TPtr&); + void Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev); + void HandleException(const std::exception& err); + + void PrintInternalState(); + void SendSessionError(NActors::TActorId readActorId, const TString& message); + +private: + + STRICT_STFUNC_EXC(StateFunc, + hFunc(NFq::TEvPrivate::TEvPqEventsReady, Handle); + hFunc(NFq::TEvPrivate::TEvCreateSession, Handle); + hFunc(NFq::TEvPrivate::TEvDataParsed, Handle); + hFunc(NFq::TEvPrivate::TEvDataAfterFilteration, Handle); + hFunc(NFq::TEvPrivate::TEvStatus, Handle); + hFunc(NFq::TEvPrivate::TEvDataFiltered, Handle); + hFunc(NFq::TEvPrivate::TEvPrintState, Handle); + 
hFunc(TEvRowDispatcher::TEvGetNextBatch, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStartSession, Handle); + cFunc(NActors::TEvents::TEvPoisonPill::EventType, PassAway); + hFunc(NFq::TEvRowDispatcher::TEvStopSession, Handle);, + ExceptionFunc(std::exception, HandleException) + ) + + STRICT_STFUNC(ErrorState, { + cFunc(NActors::TEvents::TEvPoisonPill::EventType, PassAway); + IgnoreFunc(NFq::TEvPrivate::TEvPqEventsReady); + IgnoreFunc(NFq::TEvPrivate::TEvCreateSession); + IgnoreFunc(NFq::TEvPrivate::TEvDataAfterFilteration); + IgnoreFunc(NFq::TEvPrivate::TEvStatus); + IgnoreFunc(NFq::TEvPrivate::TEvDataFiltered); + IgnoreFunc(TEvRowDispatcher::TEvGetNextBatch); + IgnoreFunc(NFq::TEvRowDispatcher::TEvStartSession); + IgnoreFunc(NFq::TEvRowDispatcher::TEvStopSession); + IgnoreFunc(NFq::TEvPrivate::TEvPrintState); + }) +}; + +TTopicSession::TTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters) + : TopicPath(topicPath) + , RowDispatcherActorId(rowDispatcherActorId) + , PartitionId(partitionId) + , Driver(std::move(driver)) + , CredentialsProviderFactory(credentialsProviderFactory) + , BufferSize(16_MB) + , LogPrefix("TopicSession") + , Config(config) + , Counters(counters) +{ +} + +void TTopicSession::Bootstrap() { + Become(&TTopicSession::StateFunc); + Metrics.Init(Counters, SelfId()); + LogPrefix = LogPrefix + " " + SelfId().ToString() + " "; + LOG_ROW_DISPATCHER_DEBUG("Bootstrap " << ", PartitionId " << PartitionId + << ", Timeout " << Config.GetTimeoutBeforeStartSessionSec() << " sec, StatusPeriod " << Config.GetSendStatusPeriodSec() << " sec"); + Y_ENSURE(Config.GetSendStatusPeriodSec() > 0); + Schedule(TDuration::Seconds(Config.GetSendStatusPeriodSec()), new NFq::TEvPrivate::TEvStatus()); + Schedule(TDuration::Seconds(PrintStatePeriodSec), new 
NFq::TEvPrivate::TEvPrintState()); +} + +void TTopicSession::PassAway() { + LOG_ROW_DISPATCHER_DEBUG("PassAway"); + StopReadSession(); + NActors::TActorBootstrapped::PassAway(); +} + +void TTopicSession::SubscribeOnNextEvent() { + if (!ReadSession || IsWaitingEvents) { + return; + } + + if (Config.GetMaxSessionUsedMemory() && UsedSize > Config.GetMaxSessionUsedMemory()) { + LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UsedSize << " bytes), skip subscribing to WaitEvent()"); + return; + } + + LOG_ROW_DISPATCHER_TRACE("SubscribeOnNextEvent"); + IsWaitingEvents = true; + Metrics.InFlySubscribe->Inc(); + NActors::TActorSystem* actorSystem = NActors::TActivationContext::ActorSystem(); + ReadSession->WaitEvent().Subscribe([actorSystem, selfId = SelfId()](const auto&){ + actorSystem->Send(selfId, new NFq::TEvPrivate::TEvPqEventsReady()); + }); +} + +NYdb::NTopic::TTopicClientSettings TTopicSession::GetTopicClientSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const { + NYdb::NTopic::TTopicClientSettings opts; + opts.Database(sourceParams.GetDatabase()) + .DiscoveryEndpoint(sourceParams.GetEndpoint()) + .SslCredentials(NYdb::TSslCredentials(sourceParams.GetUseSsl())) + .CredentialsProviderFactory(CredentialsProviderFactory); + return opts; +} + +NYdb::NTopic::TTopicClient& TTopicSession::GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) { + if (!TopicClient) { + TopicClient = std::make_unique(Driver, GetTopicClientSettings(sourceParams)); + } + return *TopicClient; +} + +TInstant TTopicSession::GetMinStartingMessageTimestamp() const { + auto result = TInstant::Max(); + Y_ENSURE(!Clients.empty()); + for (const auto& [actorId, info] : Clients) { + ui64 time = info.Settings.GetStartingMessageTimestampMs(); + result = std::min(result, TInstant::MilliSeconds(time)); + } + return result; +} + +NYdb::NTopic::TReadSessionSettings TTopicSession::GetReadSessionSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const { + 
NYdb::NTopic::TTopicReadSettings topicReadSettings; + topicReadSettings.Path(TopicPath); + topicReadSettings.AppendPartitionIds(PartitionId); + + TInstant minTime = GetMinStartingMessageTimestamp(); + LOG_ROW_DISPATCHER_INFO("Create topic session, Path " << TopicPath + << ", StartingMessageTimestamp " << minTime + << ", BufferSize " << BufferSize << ", WithoutConsumer " << Config.GetWithoutConsumer()); + + auto settings = NYdb::NTopic::TReadSessionSettings() + .AppendTopics(topicReadSettings) + .MaxMemoryUsageBytes(BufferSize) + .ReadFromTimestamp(minTime); + if (Config.GetWithoutConsumer()) { + settings.WithoutConsumer(); + } else { + settings.ConsumerName(sourceParams.GetConsumerName()); + } + return settings; +} + +void TTopicSession::CreateTopicSession() { + if (Clients.empty()) { + return; + } + + // Use any sourceParams. + const NYql::NPq::NProto::TDqPqTopicSource& sourceParams = Clients.begin()->second.Settings.GetSource(); + + if (!ReadSession) { + InitParser(sourceParams); + ReadSession = GetTopicClient(sourceParams).CreateReadSession(GetReadSessionSettings(sourceParams)); + SubscribeOnNextEvent(); + } +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvPqEventsReady::TPtr&) { + LOG_ROW_DISPATCHER_TRACE("TEvPqEventsReady"); + Metrics.InFlySubscribe->Dec(); + IsWaitingEvents = false; + HandleNewEvents(); + SubscribeOnNextEvent(); +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvCreateSession::TPtr&) { + CreateTopicSession(); +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvDataParsed::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvDataParsed, offset " << ev->Get()->Offset); + + for (auto v: ev->Get()->Value) { + LOG_ROW_DISPATCHER_TRACE("v " << v); + } + + for (auto& [actorId, info] : Clients) { + try { + if (!info.Filter) { + continue; + } + info.Filter->Push(ev->Get()->Offset, ev->Get()->Value); + } catch (const std::exception& e) { + FatalError(e.what(), &info.Filter); + } + } + auto event = std::make_unique(ev->Get()->Offset); + Send(SelfId(), 
event.release()); +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvDataAfterFilteration::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvDataAfterFilteration, read actor id " << ev->Get()->ReadActorId.ToString()); + auto it = Clients.find(ev->Get()->ReadActorId); + if (it == Clients.end()) { + LOG_ROW_DISPATCHER_ERROR("Skip DataAfterFilteration, wrong read actor, id " << ev->Get()->ReadActorId.ToString()); + return; + } + AddDataToClient(it->second, ev->Get()->Offset, ev->Get()->Json); +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvStatus::TPtr&) { + LOG_ROW_DISPATCHER_TRACE("TEvStatus"); + Schedule(TDuration::Seconds(Config.GetSendStatusPeriodSec()), new NFq::TEvPrivate::TEvStatus()); + for (auto& [actorId, info] : Clients) { + if (!info.NextMessageOffset) { + continue; + } + if (*info.NextMessageOffset <= info.LastSendedNextMessageOffset) { + continue; + } + auto event = std::make_unique(); + event->Record.SetPartitionId(PartitionId); + event->Record.SetNextMessageOffset(*info.NextMessageOffset); + info.LastSendedNextMessageOffset = *info.NextMessageOffset; + event->ReadActorId = info.ReadActorId; + LOG_ROW_DISPATCHER_TRACE("Send status to " << info.ReadActorId << ", offset " << *info.NextMessageOffset); + Send(RowDispatcherActorId, event.release()); + } +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvDataFiltered::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvDataFiltered, offset " << ev->Get()->Offset); + for (auto& [actorId, info] : Clients) { + if (!info.NextMessageOffset + || *info.NextMessageOffset < ev->Get()->Offset + 1) { + info.NextMessageOffset = ev->Get()->Offset + 1; + } + } +} + +void TTopicSession::Handle(TEvRowDispatcher::TEvGetNextBatch::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvGetNextBatch from " << ev->Sender.ToString()); + Metrics.InFlyAsyncInputData->Set(0); + auto it = Clients.find(ev->Sender); + if (it == Clients.end()) { + LOG_ROW_DISPATCHER_ERROR("Wrong client, sender " << ev->Sender); + return; + } + SendData(it->second); + 
SubscribeOnNextEvent(); +} + +void TTopicSession::HandleNewEvents() { + while (true) { + if (!ReadSession) { + return; + } + if (Config.GetMaxSessionUsedMemory() && UsedSize > Config.GetMaxSessionUsedMemory()) { + LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UsedSize << " bytes), stop reading from yds"); + break; + } + TMaybe event = ReadSession->GetEvent(false); + if (!event) { + break; + } + std::visit(TTopicEventProcessor{*this, LogPrefix}, *event); + } +} + +void TTopicSession::CloseTopicSession() { + if (!ReadSession) { + return; + } + LOG_ROW_DISPATCHER_DEBUG("Close session"); + ReadSession->Close(TDuration::Zero()); + ReadSession.reset(); +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent& event) { + Self.Metrics.RowsRead->Add(event.GetMessages().size()); + for (const auto& message : event.GetMessages()) { + const TString& data = message.GetData(); + Self.IngressStats.Bytes += data.size(); + LOG_ROW_DISPATCHER_TRACE("Data received: " << message.DebugString(true)); + + TString item = message.GetData(); + item.Detach(); + Self.SendToParsing(message.GetOffset(), item); + Self.LastMessageOffset = message.GetOffset(); + } +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TSessionClosedEvent& ev) { + TString message = TStringBuilder() << "Read session to topic \"" << Self.TopicPath << "\" was closed: " << ev.DebugString(); + LOG_ROW_DISPATCHER_DEBUG(message); + NYql::TIssues issues; + issues.AddIssue(message); + Self.FatalError(issues.ToOneLineString()); +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TStartPartitionSessionEvent& event) { + LOG_ROW_DISPATCHER_DEBUG("StartPartitionSessionEvent received"); + + TMaybe minOffset; + for (const auto& [actorId, info] : Self.Clients) { + if (!minOffset + || (info.NextMessageOffset && (info.NextMessageOffset < *minOffset))) { + minOffset = info.NextMessageOffset; + } + } + 
LOG_ROW_DISPATCHER_DEBUG("Confirm StartPartitionSession with offset " << minOffset); + event.Confirm(minOffset); +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TStopPartitionSessionEvent& event) { + LOG_ROW_DISPATCHER_DEBUG("SessionId: " << Self.GetSessionId() << " StopPartitionSessionEvent received"); + event.Confirm(); +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TEndPartitionSessionEvent& /*event*/) { + LOG_ROW_DISPATCHER_WARN("TEndPartitionSessionEvent"); +} + +void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TPartitionSessionClosedEvent& /*event*/) { + LOG_ROW_DISPATCHER_WARN("TPartitionSessionClosedEvent"); +} + +std::pair TTopicSession::CreateItem(const NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage& message) { + const TString& data = message.GetData(); + i64 usedSpace = data.Size(); + NYql::NUdf::TUnboxedValuePod item = NKikimr::NMiniKQL::MakeString(NYql::NUdf::TStringRef(data.Data(), data.Size())); + return std::make_pair(item, usedSpace); +} + +TString TTopicSession::GetSessionId() const { + return ReadSession ? 
ReadSession->GetSessionId() : TString{"empty"}; +} + +void TTopicSession::SendToParsing(ui64 offset, const TString& message) { + LOG_ROW_DISPATCHER_TRACE("SendToParsing, message " << message); + + for (auto& readActorId : ClientsWithoutPredicate) { + auto it = Clients.find(readActorId); + Y_ENSURE(it != Clients.end(), "Internal error: unknown client"); + auto& info = it->second; + if (!info.Filter) { + LOG_ROW_DISPATCHER_TRACE("Send message to client without parsing/filtering"); + AddDataToClient(info, offset, message); + } + } + + try { + Parser->Push(offset, message); + } catch (const std::exception& e) { + FatalError(e.what()); + } +} + +void TTopicSession::SendData(ClientsInfo& info) { + info.DataArrivedSent = false; + if (info.Buffer.empty()) { + LOG_ROW_DISPATCHER_TRACE("Buffer empty"); + } + + do { + auto event = std::make_unique(); + event->Record.SetPartitionId(PartitionId); + Y_ENSURE(info.NextMessageOffset); + event->ReadActorId = info.ReadActorId; + + ui64 batchSize = 0; + while (!info.Buffer.empty()) { + const auto& [offset, json] = info.Buffer.front(); + info.UsedSize -= json.size(); + UsedSize -= json.size(); + batchSize += json.size(); + NFq::NRowDispatcherProto::TEvMessage message; + message.SetJson(json); + message.SetOffset(offset); + event->Record.AddMessages()->CopyFrom(message); + event->Record.SetNextMessageOffset(offset + 1); + info.Buffer.pop(); + + if (batchSize > MaxBatchSizeBytes) { + break; + } + } + if (info.Buffer.empty()) { + event->Record.SetNextMessageOffset(*info.NextMessageOffset); + } + LOG_ROW_DISPATCHER_TRACE("SendData to " << info.ReadActorId << ", batch size " << event->Record.MessagesSize()); + Send(RowDispatcherActorId, event.release()); + } while(!info.Buffer.empty()); + info.LastSendedNextMessageOffset = *info.NextMessageOffset; +} + +void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { + auto it = Clients.find(ev->Sender); + if (it != Clients.end()) { + FatalError("Internal error: sender " + 
ev->Sender.ToString()); + return; + } + + LOG_ROW_DISPATCHER_INFO("New client, read actor id " << ev->Sender.ToString()); + + auto columns = GetVector(ev->Get()->Record.GetSource().GetColumns()); + auto types = GetVector(ev->Get()->Record.GetSource().GetColumnTypes()); + auto parserType = std::make_pair(columns, types); + if (CurrentParserTypes && *CurrentParserTypes != parserType) { + SendSessionError(ev->Sender, "Different columns/types, use same in all queries"); + return; + } + + try { + auto& clientInfo = Clients.emplace( + std::piecewise_construct, + std::forward_as_tuple(ev->Sender), + std::forward_as_tuple(ev)).first->second; + + TString predicate = clientInfo.Settings.GetSource().GetPredicate(); + if (!predicate.empty()) { + clientInfo.Filter = NewJsonFilter( + columns, + types, + predicate, + [&, actorId = clientInfo.ReadActorId](ui64 offset, const TString& json){ + Send(SelfId(), new NFq::TEvPrivate::TEvDataAfterFilteration(offset, json, actorId)); + }); + } else { + ClientsWithoutPredicate.insert(ev->Sender); + } + + LOG_ROW_DISPATCHER_INFO("New client: offset " << clientInfo.NextMessageOffset << ", predicate: " << clientInfo.Settings.GetSource().GetPredicate()); + + if (ReadSession) { + if (clientInfo.Settings.HasOffset() && (clientInfo.Settings.GetOffset() <= LastMessageOffset)) { + LOG_ROW_DISPATCHER_INFO("New client has less offset than the last message, stop (restart) topic session"); + StopReadSession(); + } + } + } catch (const NYql::NPureCalc::TCompileError& e) { + FatalError("Adding new client failed: CompileError: sql: " + e.GetYql() + ", error: " + e.GetIssues()); + } catch (const yexception &ex) { + FatalError(TString{"Adding new client failed: "} + ex.what()); + } catch (...) 
{ + FatalError("Adding new client failed, " + CurrentExceptionMessage()); + } + + PrintInternalState(); + if (!ReadSession) { + Schedule(TDuration::Seconds(Config.GetTimeoutBeforeStartSessionSec()), new NFq::TEvPrivate::TEvCreateSession()); + } +} + +void TTopicSession::AddDataToClient(ClientsInfo& info, ui64 offset, const TString& json) { + if (info.NextMessageOffset && offset < info.NextMessageOffset) { + return; + } + info.NextMessageOffset = offset + 1; + info.Buffer.push(std::make_pair(offset, json)); + info.UsedSize += json.size(); + UsedSize += json.size(); + SendDataArrived(info); +} + +void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev) { + LOG_ROW_DISPATCHER_DEBUG("TEvStopSession, topicPath " << ev->Get()->Record.GetSource().GetTopicPath() << + " partitionId " << ev->Get()->Record.GetPartitionId()); + + auto it = Clients.find(ev->Sender); + if (it == Clients.end()) { + LOG_ROW_DISPATCHER_DEBUG("Wrong ClientSettings"); // TODO + return; + } + Clients.erase(it); + ClientsWithoutPredicate.erase(ev->Sender); +} + +void TTopicSession::InitParser(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) { + if (Parser) { + return; + } + try { + CurrentParserTypes = std::make_pair(GetVector(sourceParams.GetColumns()), GetVector(sourceParams.GetColumnTypes())); + NActors::TActorSystem* actorSystem = NActors::TActivationContext::ActorSystem(); + Parser = NewJsonParser( + GetVector(sourceParams.GetColumns()), + GetVector(sourceParams.GetColumnTypes()), + [actorSystem, selfId = SelfId()](ui64 offset, TList&& value){ + actorSystem->Send(selfId, new NFq::TEvPrivate::TEvDataParsed(offset, std::move(value))); + }); + } catch (const NYql::NPureCalc::TCompileError& e) { + FatalError(e.GetIssues()); + } +} + +void TTopicSession::FatalError(const TString& message, const std::unique_ptr* filter) { + TStringStream str; + str << message; + if (Parser) { + str << ", parser sql: " << Parser->GetSql(); + } + if (filter) { + str << ", filter sql:" << 
(*filter)->GetSql(); + } + LOG_ROW_DISPATCHER_ERROR("FatalError: " << str.Str()); + + for (auto& [readActorId, info] : Clients) { + LOG_ROW_DISPATCHER_DEBUG("Send TEvSessionError to " << readActorId); + SendSessionError(readActorId, str.Str()); + } + StopReadSession(); + Become(&TTopicSession::ErrorState); + ythrow yexception() << "FatalError: " << str.Str(); // To exit from current stack and call once PassAway() in HandleException(). +} + +void TTopicSession::SendSessionError(NActors::TActorId readActorId, const TString& message) { + auto event = std::make_unique(); + event->Record.SetMessage(message); + event->Record.SetPartitionId(PartitionId); + event->ReadActorId = readActorId; + Send(RowDispatcherActorId, event.release()); +} + +void TTopicSession::StopReadSession() { + if (ReadSession) { + LOG_ROW_DISPATCHER_DEBUG("Close read session"); + ReadSession->Close(TDuration::Zero()); + ReadSession.reset(); + } + TopicClient.reset(); +} + +void TTopicSession::SendDataArrived(ClientsInfo& info) { + if (info.Buffer.empty() || info.DataArrivedSent) { + return; + } + info.DataArrivedSent = true; + LOG_ROW_DISPATCHER_TRACE("Send TEvNewDataArrived to " << info.ReadActorId); + Metrics.InFlyAsyncInputData->Set(1); + auto event = std::make_unique(); + event->Record.SetPartitionId(PartitionId); + event->ReadActorId = info.ReadActorId; + Send(RowDispatcherActorId, event.release()); +} + +void TTopicSession::HandleException(const std::exception& e) { + if (CurrentStateFunc() == &TThis::ErrorState) { + return; + } + FatalError(TString("Internal error: exception: ") + e.what()); +} + +void TTopicSession::PrintInternalState() { + TStringStream str; + str << "Clients:\n"; + str << "UsedSize: " << UsedSize << "\n"; + for (auto& [readActorId, info] : Clients) { + str << " read actor id " << readActorId << ", buffer size " << info.Buffer.size() + << ", used size: " << info.UsedSize << ", data arrived sent " << info.DataArrivedSent + << ", next offset " << info.NextMessageOffset << 
"\n"; + } + LOG_ROW_DISPATCHER_DEBUG(str.Str()); +} + +void TTopicSession::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { + Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); + PrintInternalState(); +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr NewTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters) { + return std::unique_ptr(new TTopicSession(topicPath, config, rowDispatcherActorId, partitionId, std::move(driver), credentialsProviderFactory, counters)); +} + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.h b/ydb/core/fq/libs/row_dispatcher/topic_session.h new file mode 100644 index 000000000000..b3980cce8269 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include + +namespace NFq { + +std::unique_ptr NewTopicSession( + const TString& topicPath, + const NConfig::TRowDispatcherConfig& config, + NActors::TActorId rowDispatcherActorId, + ui32 partitionId, + NYdb::TDriver driver, + std::shared_ptr credentialsProviderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters); + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/ut/coordinator_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/coordinator_ut.cpp new file mode 100644 index 000000000000..478326acf53c --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/coordinator_ut.cpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + 
TFixture() + : Runtime(4) {} + + void SetUp(NUnitTest::TTestContext&) override { + TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_TRACE); + auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; + auto yqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); + + LocalRowDispatcherId = Runtime.AllocateEdgeActor(0); + RowDispatcher1Id = Runtime.AllocateEdgeActor(1); + RowDispatcher2Id = Runtime.AllocateEdgeActor(2); + ReadActor1 = Runtime.AllocateEdgeActor(0); + ReadActor2 = Runtime.AllocateEdgeActor(0); + + NConfig::TRowDispatcherCoordinatorConfig config; + config.SetCoordinationNodePath("RowDispatcher"); + auto& database = *config.MutableDatabase(); + database.SetEndpoint("YDB_ENDPOINT"); + database.SetDatabase("YDB_DATABASE"); + database.SetToken(""); + + Coordinator = Runtime.Register(NewCoordinator( + LocalRowDispatcherId, + config, + yqSharedResources, + "Tenant", + MakeIntrusive() + ).release()); + + Runtime.EnableScheduleForActor(Coordinator); + + TDispatchOptions options; + options.FinalEvents.emplace_back(NActors::TEvents::TSystem::Bootstrap, 1); + Runtime.DispatchEvents(options); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + } + + NYql::NPq::NProto::TDqPqTopicSource BuildPqTopicSourceSettings( + TString topic) + { + NYql::NPq::NProto::TDqPqTopicSource settings; + settings.SetTopicPath(topic); + settings.SetConsumerName("PqConsumer"); + settings.SetEndpoint("Endpoint"); + settings.MutableToken()->SetName("token"); + settings.SetDatabase("Database"); + return settings; + } + + void ExpectCoordinatorChangesSubscribe() { + auto eventHolder = Runtime.GrabEdgeEvent(LocalRowDispatcherId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + void Ping(NActors::TActorId rowDispatcherId) { + auto event = new NActors::TEvents::TEvPing(); + 
Runtime.Send(new NActors::IEventHandle(Coordinator, rowDispatcherId, event)); + + // TODO: GrabEdgeEvent is not working with events on other nodes ?! + //auto eventHolder = Runtime.GrabEdgeEvent(rowDispatcherId, TDuration::Seconds(5)); + //UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + void MockRequest(NActors::TActorId readActorId, TString topicName, const std::vector& partitionId) { + auto event = new NFq::TEvRowDispatcher::TEvCoordinatorRequest( + BuildPqTopicSourceSettings(topicName), + partitionId); + Runtime.Send(new NActors::IEventHandle(Coordinator, readActorId, event)); + } + + NFq::NRowDispatcherProto::TEvGetAddressResponse ExpectResult(NActors::TActorId readActorId) { + auto eventPtr = Runtime.GrabEdgeEvent(readActorId, TDuration::Seconds(5)); + UNIT_ASSERT(eventPtr.Get() != nullptr); + NFq::NRowDispatcherProto::TEvGetAddressResponse result; + result.CopyFrom(eventPtr->Get()->Record); + return result; + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + NActors::TActorId Coordinator; + NActors::TActorId LocalRowDispatcherId; + NActors::TActorId RowDispatcher1Id; + NActors::TActorId RowDispatcher2Id; + NActors::TActorId ReadActor1; + NActors::TActorId ReadActor2; + + NYql::NPq::NProto::TDqPqTopicSource Source1 = BuildPqTopicSourceSettings("Source1"); +}; + +Y_UNIT_TEST_SUITE(CoordinatorTests) { + Y_UNIT_TEST_F(Route, TFixture) { + + ExpectCoordinatorChangesSubscribe(); + + TSet rowDispatcherIds{RowDispatcher1Id, RowDispatcher2Id, LocalRowDispatcherId}; + for (auto id : rowDispatcherIds) { + Ping(id); + } + + MockRequest(ReadActor1, "topic1", {0}); + auto result1 = ExpectResult(ReadActor1); + + MockRequest(ReadActor2, "topic1", {0}); + auto result2 = ExpectResult(ReadActor2); + + UNIT_ASSERT(result1.PartitionsSize() == 1); + UNIT_ASSERT(result2.PartitionsSize() == 1); + UNIT_ASSERT(google::protobuf::util::MessageDifferencer::Equals(result1, result2)); + + MockRequest(ReadActor2, "topic1", {1}); + auto result3 = 
ExpectResult(ReadActor2); + + TActorId actualRowDispatcher1 = ActorIdFromProto(result1.GetPartitions(0).GetActorId()); + TActorId actualRowDispatcher2 = ActorIdFromProto(result2.GetPartitions(0).GetActorId()); + TActorId actualRowDispatcher3 = ActorIdFromProto(result3.GetPartitions(0).GetActorId()); + + UNIT_ASSERT(rowDispatcherIds.contains(actualRowDispatcher1)); + UNIT_ASSERT(rowDispatcherIds.contains(actualRowDispatcher2)); + UNIT_ASSERT(rowDispatcherIds.contains(actualRowDispatcher3)); + UNIT_ASSERT(actualRowDispatcher1 != actualRowDispatcher3); + + // RowDispatchers is restarted. + // Skip Disconnected/Coonnected in test. + auto newDispatcher1Id = Runtime.AllocateEdgeActor(1); + Ping(newDispatcher1Id); + + auto newDispatcher2Id = Runtime.AllocateEdgeActor(1); + Ping(newDispatcher2Id); + + MockRequest(ReadActor1, "topic1", {0}); + auto result4 = ExpectResult(ReadActor1); + + MockRequest(ReadActor2, "topic1", {1}); + auto result5 = ExpectResult(ReadActor2); + + UNIT_ASSERT(!google::protobuf::util::MessageDifferencer::Equals(result1, result4) + || !google::protobuf::util::MessageDifferencer::Equals(result3, result5)); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp new file mode 100644 index 000000000000..1645f521051d --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp @@ -0,0 +1,91 @@ +#include +#include + +#include + +#include +#include +#include +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + TFixture() + : Runtime(true) {} + + void SetUp(NUnitTest::TTestContext&) override { + TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + Filter.reset(); + } + + void MakeFilter( + const TVector& 
columns, + const TVector& types, + const TString& whereFilter, + NFq::TJsonFilter::TCallback callback) { + Filter = NFq::NewJsonFilter( + columns, + types, + whereFilter, + callback); + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + std::unique_ptr Filter; +}; + +Y_UNIT_TEST_SUITE(TJsonFilterTests) { + Y_UNIT_TEST_F(Simple1, TFixture) { + TMap result; + MakeFilter( + {"a1", "a2"}, + {"String", "UInt64"}, + "where a2 > 100", + [&](ui64 offset, const TString& json) { + result[offset] = json; + }); + Filter->Push(5, {"hello1", "99"}); + Filter->Push(6, {"hello2", "101"}); + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + } + + Y_UNIT_TEST_F(Simple2, TFixture) { + TMap result; + MakeFilter( + {"a2", "a1"}, + {"UInt64", "String"}, + "where a2 > 100", + [&](ui64 offset, const TString& json) { + result[offset] = json; + }); + Filter->Push(5, {"99", "hello1"}); + Filter->Push(6, {"101", "hello2"}); + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + } + + Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { + MakeFilter( + {"a1", "a2"}, + {"String", "UInt64"}, + "where Unwrap(a2) = 1", + [&](ui64, const TString&) { }); + UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push(5, {"99", "hello1"}), yexception, "Failed to unwrap empty optional"); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp new file mode 100644 index 000000000000..a9c389d3900f --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp @@ -0,0 +1,122 @@ +#include +#include + +#include + +#include +#include +#include + +#include + +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + TFixture() + : Runtime(true) {} + + void SetUp(NUnitTest::TTestContext&) override { + 
TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + if (Parser) { + Parser.reset(); + } + } + + void MakeParser(TVector columns, TVector types, NFq::TJsonParser::TCallback callback) { + try { + Parser = NFq::NewJsonParser( + columns, + types, + callback); + } catch (NYql::NPureCalc::TCompileError compileError) { + UNIT_ASSERT_C(false, TStringBuilder() << "Failed to create json parser: " << compileError.what() << "\nQuery text:\n" << compileError.GetYql() << "Reason:\n" << compileError.GetIssues()); + } + } + + void MakeParser(TVector columns, NFq::TJsonParser::TCallback callback) { + MakeParser(columns, TVector(columns.size(), "String"), callback); + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + std::unique_ptr Parser; +}; + +Y_UNIT_TEST_SUITE(TJsonParserTests) { + Y_UNIT_TEST_F(Simple1, TFixture) { + TList result; + ui64 resultOffset; + MakeParser({"a1", "a2"}, {"String", "Optional"}, [&](ui64 offset, TList&& value){ + resultOffset = offset; + result = std::move(value); + }); + Parser->Push(5, R"({"a1": "hello1", "a2": 101, "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); + UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); + UNIT_ASSERT_VALUES_EQUAL("101", result.back()); + } + + Y_UNIT_TEST_F(Simple2, TFixture) { + TList result; + ui64 resultOffset; + MakeParser({"a2", "a1"}, [&](ui64 offset, TList&& value){ + resultOffset = offset; + result = std::move(value); + }); + Parser->Push(5, R"({"a1": "hello1", "a2": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); + UNIT_ASSERT_VALUES_EQUAL("101", result.front()); + UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); + } + + Y_UNIT_TEST_F(Simple3, TFixture) { + TList result; + 
ui64 resultOffset; + MakeParser({"a1", "a2"}, [&](ui64 offset, TList&& value){ + resultOffset = offset; + result = std::move(value); + }); + Parser->Push(5, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); + UNIT_ASSERT_VALUES_EQUAL("101", result.front()); + UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); + } + + Y_UNIT_TEST_F(Simple4, TFixture) { + TList result; + ui64 resultOffset; + MakeParser({"a2", "a1"}, [&](ui64 offset, TList&& value){ + resultOffset = offset; + result = std::move(value); + }); + Parser->Push(5, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); + UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); + UNIT_ASSERT_VALUES_EQUAL("101", result.back()); + } + + Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { + + MakeParser({"a2", "a1"}, [&](ui64, TList&&){ }); + UNIT_ASSERT_EXCEPTION_CONTAINS(Parser->Push(5, R"(ydb)"), yexception, "DB::ParsingException: Cannot parse input: expected '{' before: 'ydb': (at row 1)"); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp new file mode 100644 index 000000000000..93ccaa8c151e --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + TFixture() + : Runtime(1, false) {} + + void SetUp(NUnitTest::TTestContext&) override { + TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); + auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; + auto yqSharedResources = 
NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); + + RowDispatcher = Runtime.AllocateEdgeActor(); + Coordinator1 = Runtime.AllocateEdgeActor(); + Coordinator2 = Runtime.AllocateEdgeActor(); + Coordinator3 = Runtime.AllocateEdgeActor(); + + NConfig::TRowDispatcherCoordinatorConfig config; + config.SetCoordinationNodePath("row_dispatcher"); + auto& database = *config.MutableDatabase(); + database.SetEndpoint(GetEnv("YDB_ENDPOINT")); + database.SetDatabase(GetEnv("YDB_DATABASE")); + database.SetToken(""); + + LeaderElection1 = Runtime.Register(NewLeaderElection( + RowDispatcher, + Coordinator1, + config, + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + "/tenant", + MakeIntrusive() + ).release()); + + LeaderElection2 = Runtime.Register(NewLeaderElection( + RowDispatcher, + Coordinator2, + config, + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + "/tenant", + MakeIntrusive() + ).release()); + + LeaderElection3 = Runtime.Register(NewLeaderElection( + RowDispatcher, + Coordinator3, + config, + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + "/tenant", + MakeIntrusive() + ).release()); + + Runtime.EnableScheduleForActor(LeaderElection1); + Runtime.EnableScheduleForActor(LeaderElection2); + Runtime.EnableScheduleForActor(LeaderElection3); + + TDispatchOptions options; + options.FinalEvents.emplace_back(NActors::TEvents::TSystem::Bootstrap, 3); + Runtime.DispatchEvents(options); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + } + + NActors::TActorId ExpectCoordinatorChanged() { + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcher); + UNIT_ASSERT(eventHolder.Get() != nullptr); + return eventHolder.Get()->Get()->CoordinatorActorId; + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + NActors::TActorId RowDispatcher; + NActors::TActorId LeaderElection1; + NActors::TActorId LeaderElection2; + 
NActors::TActorId LeaderElection3; + NActors::TActorId Coordinator1; + NActors::TActorId Coordinator2; + NActors::TActorId Coordinator3; + NActors::TActorId LeaderDetector; +}; + +Y_UNIT_TEST_SUITE(LeaderElectionTests) { + Y_UNIT_TEST_F(Test1, TFixture) { + + auto coordinatorId1 = ExpectCoordinatorChanged(); + auto coordinatorId2 = ExpectCoordinatorChanged(); + auto coordinatorId3 = ExpectCoordinatorChanged(); + UNIT_ASSERT(coordinatorId1 == coordinatorId2); + UNIT_ASSERT(coordinatorId2 == coordinatorId3); + + NActors::TActorId currentLeader; + NActors::TActorId notActive; + if (coordinatorId1 == Coordinator1) { + currentLeader = LeaderElection1; + } else if (coordinatorId1 == Coordinator2) { + currentLeader = LeaderElection2; + } else { + currentLeader = LeaderElection3; + } + + Runtime.Send(new IEventHandle(currentLeader, RowDispatcher, new NActors::TEvents::TEvPoisonPill())); + auto coordinatorId4 = ExpectCoordinatorChanged(); + auto coordinatorId5 = ExpectCoordinatorChanged(); + UNIT_ASSERT(coordinatorId4 == coordinatorId5); + UNIT_ASSERT(coordinatorId4 != coordinatorId1); + + if (coordinatorId4 == Coordinator1) { + currentLeader = LeaderElection1; + } else if (coordinatorId4 == Coordinator2) { + currentLeader = LeaderElection2; + } else { + currentLeader = LeaderElection3; + } + + Runtime.Send(new IEventHandle(currentLeader, RowDispatcher, new NActors::TEvents::TEvPoisonPill())); + auto coordinatorId6 = ExpectCoordinatorChanged(); + UNIT_ASSERT(coordinatorId6 != coordinatorId4); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp new file mode 100644 index 000000000000..f5641e815539 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp @@ -0,0 +1,342 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; + +struct TTestActorFactory : public 
NFq::NRowDispatcher::IActorFactory { + TTestActorFactory(NActors::TTestActorRuntime& runtime) + : Runtime(runtime) + {} + + NActors::TActorId PopActorId() { + UNIT_ASSERT(!ActorIds.empty()); + auto result = ActorIds.front(); + ActorIds.pop(); + return result; + } + + NActors::TActorId RegisterTopicSession( + const TString& /*topicPath*/, + const NConfig::TRowDispatcherConfig& /*config*/, + NActors::TActorId /*rowDispatcherActorId*/, + ui32 /*partitionId*/, + NYdb::TDriver /*driver*/, + std::shared_ptr /*credentialsProviderFactory*/, + const ::NMonitoring::TDynamicCounterPtr& /*counters*/) const override { + auto actorId = Runtime.AllocateEdgeActor(); + ActorIds.push(actorId); + return actorId; + } + + NActors::TTestActorRuntime& Runtime; + mutable TQueue ActorIds; +}; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + TFixture() + : Runtime(1) {} + + void SetUp(NUnitTest::TTestContext&) override { + TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_TRACE); + NConfig::TRowDispatcherConfig config; + config.SetEnabled(true); + NConfig::TRowDispatcherCoordinatorConfig& coordinatorConfig = *config.MutableCoordinator(); + coordinatorConfig.SetCoordinationNodePath("RowDispatcher"); + auto& database = *coordinatorConfig.MutableDatabase(); + database.SetEndpoint("YDB_ENDPOINT"); + database.SetDatabase("YDB_DATABASE"); + database.SetToken(""); + + NConfig::TCommonConfig commonConfig; + auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; + auto yqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); + + NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory; + Coordinator = Runtime.AllocateEdgeActor(); + EdgeActor = Runtime.AllocateEdgeActor(); + ReadActorId1 = Runtime.AllocateEdgeActor(); + ReadActorId2 = Runtime.AllocateEdgeActor(); + TestActorFactory = MakeIntrusive(Runtime); 
+ + RowDispatcher = Runtime.Register(NewRowDispatcher( + config, + commonConfig, + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + credentialsFactory, + "Tenant", + TestActorFactory, + MakeIntrusive() + ).release()); + + Runtime.EnableScheduleForActor(RowDispatcher); + + TDispatchOptions options; + options.FinalEvents.emplace_back(NActors::TEvents::TSystem::Bootstrap, 1); + Runtime.DispatchEvents(options); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + } + + NYql::NPq::NProto::TDqPqTopicSource BuildPqTopicSourceSettings( + TString endpoint, + TString database, + TString topic) + { + NYql::NPq::NProto::TDqPqTopicSource settings; + settings.SetTopicPath(topic); + settings.SetConsumerName("PqConsumer"); + settings.SetEndpoint(endpoint); + settings.MutableToken()->SetName("token"); + settings.SetDatabase(database); + return settings; + } + + void MockAddSession(const NYql::NPq::NProto::TDqPqTopicSource& source, ui64 partitionId, TActorId readActorId) { + auto event = new NFq::TEvRowDispatcher::TEvStartSession( + source, + partitionId, // partitionId + "Token", + Nothing(), // readOffset, + 0, // StartingMessageTimestamp; + "QueryId"); + Runtime.Send(new IEventHandle(RowDispatcher, readActorId, event)); + } + + void MockStopSession(const NYql::NPq::NProto::TDqPqTopicSource& source, ui64 partitionId, TActorId readActorId) { + auto event = std::make_unique(); + event->Record.MutableSource()->CopyFrom(source); + event->Record.SetPartitionId(partitionId); + Runtime.Send(new IEventHandle(RowDispatcher, readActorId, event.release())); + } + + void MockNewDataArrived(ui64 partitionId, TActorId topicSessionId, TActorId readActorId) { + auto event = std::make_unique(); + event->Record.SetPartitionId(partitionId); + event->ReadActorId = readActorId; + Runtime.Send(new IEventHandle(RowDispatcher, topicSessionId, event.release())); + } + + void MockMessageBatch(ui64 partitionId, TActorId topicSessionId, TActorId readActorId) { + auto 
event = std::make_unique(); + event->Record.SetPartitionId(partitionId); + event->ReadActorId = readActorId; + Runtime.Send(new IEventHandle(RowDispatcher, topicSessionId, event.release())); + } + + void MockSessionError(ui64 partitionId, TActorId topicSessionId, TActorId readActorId) { + auto event = std::make_unique(); + event->Record.SetPartitionId(partitionId); + event->ReadActorId = readActorId; + Runtime.Send(new IEventHandle(RowDispatcher, topicSessionId, event.release())); + } + + void MockGetNextBatch(ui64 partitionId, TActorId readActorId) { + auto event = std::make_unique(); + event->Record.SetPartitionId(partitionId); + Runtime.Send(new IEventHandle(RowDispatcher, readActorId, event.release())); + } + + void ExpectStartSession(NActors::TActorId actorId) { + auto eventHolder = Runtime.GrabEdgeEvent(actorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + void ExpectStopSession(NActors::TActorId actorId, ui64 partitionId) { + auto eventHolder = Runtime.GrabEdgeEvent(actorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetPartitionId() == partitionId); + } + + void ExpectGetNextBatch(NActors::TActorId topicSessionId, ui64 partitionId) { + auto eventHolder = Runtime.GrabEdgeEvent(topicSessionId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetPartitionId() == partitionId); + } + + void ExpectNewDataArrived(NActors::TActorId readActorId, ui64 partitionId) { + auto eventHolder = Runtime.GrabEdgeEvent(readActorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetPartitionId() == partitionId); + } + + void ExpectStartSessionAck(NActors::TActorId readActorId) { + auto eventHolder = Runtime.GrabEdgeEvent(readActorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + void ExpectMessageBatch(NActors::TActorId readActorId) { + auto eventHolder = Runtime.GrabEdgeEvent(readActorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + 
void ExpectSessionError(NActors::TActorId readActorId, ui64 partitionId) { + auto eventHolder = Runtime.GrabEdgeEvent(readActorId); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetPartitionId() == partitionId); + } + + NActors::TActorId ExpectRegisterTopicSession() { + auto actorId = TestActorFactory->PopActorId(); + return actorId; + } + + void ProcessData(NActors::TActorId readActorId, ui64 partId, NActors::TActorId topicSessionActorId) { + MockNewDataArrived(partId, topicSessionActorId, readActorId); + ExpectNewDataArrived(readActorId, partId); + + MockGetNextBatch(partId, readActorId); + ExpectGetNextBatch(topicSessionActorId, partId); + + MockMessageBatch(partId, topicSessionActorId, readActorId); + ExpectMessageBatch(readActorId); + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + NActors::TActorId RowDispatcher; + NActors::TActorId Coordinator; + NActors::TActorId EdgeActor; + NActors::TActorId ReadActorId1; + NActors::TActorId ReadActorId2; + TIntrusivePtr TestActorFactory; + + NYql::NPq::NProto::TDqPqTopicSource Source1 = BuildPqTopicSourceSettings("Endpoint1", "Database1", "topic"); + NYql::NPq::NProto::TDqPqTopicSource Source2 = BuildPqTopicSourceSettings("Endpoint2", "Database1", "topic"); + + ui64 PartitionId0 = 0; + ui64 PartitionId1 = 1; +}; + +Y_UNIT_TEST_SUITE(RowDispatcherTests) { + Y_UNIT_TEST_F(OneClientOneSession, TFixture) { + MockAddSession(Source1, PartitionId0, ReadActorId1); + auto topicSessionId = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId1); + ExpectStartSession(topicSessionId); + + ProcessData(ReadActorId1, PartitionId0, topicSessionId); + + MockStopSession(Source1, PartitionId0, ReadActorId1); + ExpectStopSession(topicSessionId, PartitionId0); + } + + Y_UNIT_TEST_F(TwoClientOneSession, TFixture) { + MockAddSession(Source1, PartitionId0, ReadActorId1); + auto topicSessionId = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId1); 
+ ExpectStartSession(topicSessionId); + + MockAddSession(Source1, PartitionId0, ReadActorId2); + ExpectStartSessionAck(ReadActorId2); + ExpectStartSession(topicSessionId); + + ProcessData(ReadActorId1, PartitionId0, topicSessionId); + ProcessData(ReadActorId2, PartitionId0, topicSessionId); + + MockSessionError(PartitionId0, topicSessionId, ReadActorId1); + ExpectSessionError(ReadActorId1, PartitionId0); + + MockSessionError(PartitionId0, topicSessionId, ReadActorId2); + ExpectSessionError(ReadActorId2, PartitionId0); + } + + Y_UNIT_TEST_F(SessionError, TFixture) { + MockAddSession(Source1, PartitionId0, ReadActorId1); + auto topicSessionId = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId1); + ExpectStartSession(topicSessionId); + + MockSessionError(PartitionId0, topicSessionId, ReadActorId1); + ExpectSessionError(ReadActorId1, PartitionId0); + } + + Y_UNIT_TEST_F(CoordinatorSubscribe, TFixture) { + Runtime.Send(new IEventHandle(RowDispatcher, EdgeActor, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(Coordinator))); + Runtime.Send(new IEventHandle(RowDispatcher, ReadActorId1, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe)); + + auto eventHolder = Runtime.GrabEdgeEvent(ReadActorId1); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->CoordinatorActorId == Coordinator); + } + + Y_UNIT_TEST_F(CoordinatorSubscribeBeforeCoordinatorChanged, TFixture) { + Runtime.Send(new IEventHandle(RowDispatcher, ReadActorId1, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe)); + Runtime.Send(new IEventHandle(RowDispatcher, ReadActorId2, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe)); + + Runtime.Send(new IEventHandle(RowDispatcher, EdgeActor, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(Coordinator))); + + auto eventHolder = Runtime.GrabEdgeEvent(ReadActorId1); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->CoordinatorActorId == Coordinator); + + eventHolder = 
Runtime.GrabEdgeEvent(ReadActorId2); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->CoordinatorActorId == Coordinator); + } + + Y_UNIT_TEST_F(TwoClients4Sessions, TFixture) { + + MockAddSession(Source1, PartitionId0, ReadActorId1); + auto topicSession1 = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId1); + ExpectStartSession(topicSession1); + + MockAddSession(Source1, PartitionId1, ReadActorId1); + auto topicSession2 = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId1); + ExpectStartSession(topicSession2); + + MockAddSession(Source2, PartitionId0, ReadActorId2); + auto topicSession3 = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId2); + ExpectStartSession(topicSession3); + + MockAddSession(Source2, PartitionId1, ReadActorId2); + auto topicSession4 = ExpectRegisterTopicSession(); + ExpectStartSessionAck(ReadActorId2); + ExpectStartSession(topicSession4); + + ProcessData(ReadActorId1, PartitionId0, topicSession1); + ProcessData(ReadActorId1, PartitionId1, topicSession2); + ProcessData(ReadActorId2, PartitionId0, topicSession3); + ProcessData(ReadActorId2, PartitionId1, topicSession4); + + MockSessionError(PartitionId0, topicSession1, ReadActorId1); + ExpectSessionError(ReadActorId1, PartitionId0); + + ProcessData(ReadActorId1, PartitionId1, topicSession2); + ProcessData(ReadActorId2, PartitionId0, topicSession3); + ProcessData(ReadActorId2, PartitionId1, topicSession4); + + MockStopSession(Source1, PartitionId1, ReadActorId1); + ExpectStopSession(topicSession2, PartitionId1); + + MockStopSession(Source2, PartitionId0, ReadActorId2); + ExpectStopSession(topicSession3, PartitionId0); + + MockStopSession(Source2, PartitionId1, ReadActorId2); + ExpectStopSession(topicSession4, PartitionId1); + + // Ignore data after StopSession + MockMessageBatch(PartitionId1, topicSession4, ReadActorId2); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp 
b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp new file mode 100644 index 000000000000..ba24378e0a35 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -0,0 +1,357 @@ +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +using namespace NKikimr; +using namespace NFq; +using namespace NYql::NDq; + +const ui64 TimeoutBeforeStartSessionSec = 3; +const ui64 GrabTimeoutSec = 4 * TimeoutBeforeStartSessionSec; + +class TFixture : public NUnitTest::TBaseFixture { + +public: + TFixture() + : Runtime(true) {} + + void SetUp(NUnitTest::TTestContext&) override { + TAutoPtr app = new TAppPrepare(); + Runtime.Initialize(app->Unwrap()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_TRACE); + Runtime.SetDispatchTimeout(TDuration::Seconds(5)); + + ReadActorId1 = Runtime.AllocateEdgeActor(); + ReadActorId2 = Runtime.AllocateEdgeActor(); + RowDispatcherActorId = Runtime.AllocateEdgeActor(); + } + + void Init(const TString& topicPath, ui64 maxSessionUsedMemory = std::numeric_limits::max()) { + Config.SetTimeoutBeforeStartSessionSec(TimeoutBeforeStartSessionSec); + Config.SetMaxSessionUsedMemory(maxSessionUsedMemory); + Config.SetSendStatusPeriodSec(2); + Config.SetWithoutConsumer(true); + + TopicSession = Runtime.Register(NewTopicSession( + topicPath, + Config, + RowDispatcherActorId, + 0, + Driver, + CredentialsProviderFactory, + MakeIntrusive() + ).release()); + Runtime.EnableScheduleForActor(TopicSession); + + TDispatchOptions options; + options.FinalEvents.emplace_back(NActors::TEvents::TSystem::Bootstrap, 1); + UNIT_ASSERT(Runtime.DispatchEvents(options)); + } + + void TearDown(NUnitTest::TTestContext& /* context */) override { + } + + void StartSession(TActorId readActorId, const NYql::NPq::NProto::TDqPqTopicSource& source, TMaybe readOffset = Nothing()) { + auto event = new NFq::TEvRowDispatcher::TEvStartSession( + source, + PartitionId, + "Token", + 
readOffset, // readOffset, + 0, // StartingMessageTimestamp; + "QueryId"); + Runtime.Send(new IEventHandle(TopicSession, readActorId, event)); + } + + NYql::NPq::NProto::TDqPqTopicSource BuildSource(TString topic, bool emptyPredicate = false) { + NYql::NPq::NProto::TDqPqTopicSource settings; + settings.SetEndpoint(GetDefaultPqEndpoint()); + settings.SetTopicPath(topic); + settings.SetConsumerName("PqConsumer"); + settings.MutableToken()->SetName("token"); + settings.SetDatabase(GetDefaultPqDatabase()); + settings.AddColumns("dt"); + settings.AddColumns("value"); + settings.AddColumnTypes("Uint64"); + settings.AddColumnTypes("String"); + if (!emptyPredicate) { + settings.SetPredicate("WHERE true"); + } + return settings; + } + + void StopSession(NActors::TActorId readActorId, const NYql::NPq::NProto::TDqPqTopicSource& source) { + auto event = std::make_unique(); + *event->Record.MutableSource() = source; + event->Record.SetPartitionId(PartitionId); + Runtime.Send(new IEventHandle(TopicSession, readActorId, event.release())); + } + + void ExpectMessageBatch(NActors::TActorId readActorId, const std::vector& expected) { + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); + UNIT_ASSERT(expected.size() == eventHolder->Get()->Record.MessagesSize()); + for (size_t i = 0; i < expected.size(); ++i) { + NFq::NRowDispatcherProto::TEvMessage message = eventHolder->Get()->Record.GetMessages(i); + std::cerr << "message.GetJson() " << message.GetJson() << std::endl; + UNIT_ASSERT(expected[i] == message.GetJson()); + } + } + + void ExpectSessionError(NActors::TActorId readActorId, TString message) { + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); + 
UNIT_ASSERT(TString(eventHolder->Get()->Record.GetMessage()).Contains(message)); + } + + void ExpectNewDataArrived(TSet readActorIds) { + size_t count = readActorIds.size(); + for (size_t i = 0; i < count; ++i) { + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(readActorIds.contains(eventHolder->Get()->ReadActorId)); + readActorIds.erase(eventHolder->Get()->ReadActorId); + } + } + + size_t ReadMessages(NActors::TActorId readActorId) { + Runtime.Send(new IEventHandle(TopicSession, readActorId, new TEvRowDispatcher::TEvGetNextBatch())); + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); + return eventHolder->Get()->Record.MessagesSize(); + } + + TActorSystemStub actorSystemStub; + NActors::TTestActorRuntime Runtime; + NActors::TActorId TopicSession; + NActors::TActorId RowDispatcherActorId; + NYdb::TDriver Driver = NYdb::TDriver(NYdb::TDriverConfig().SetLog(CreateLogBackend("cerr"))); + std::shared_ptr CredentialsProviderFactory; + NActors::TActorId ReadActorId1; + NActors::TActorId ReadActorId2; + ui64 PartitionId = 0; + NConfig::TRowDispatcherConfig Config; + + const TString Json1 = "{\"dt\":100,\"value\":\"value1\"}"; + const TString Json2 = "{\"dt\":200,\"value\":\"value2\"}"; + const TString Json3 = "{\"dt\":300,\"value\":\"value3\"}"; + const TString Json4 = "{\"dt\":400,\"value\":\"value4\"}"; +}; + +Y_UNIT_TEST_SUITE(TopicSessionTests) { + Y_UNIT_TEST_F(TwoSessionsWithoutOffsets, TFixture) { + const TString topicName = "topic1"; + PQCreateStream(topicName); + Init(topicName); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + StartSession(ReadActorId2, source); + + const std::vector data = { Json1 }; + PQWrite(data, topicName); + ExpectNewDataArrived({ReadActorId1, 
ReadActorId2}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, { Json1 }); + ExpectMessageBatch(ReadActorId2, { Json1 }); + + StopSession(ReadActorId1, source); + StopSession(ReadActorId2, source); + } + + Y_UNIT_TEST_F(SessionWithPredicateAndSessionWithoutPredicate, TFixture) { + const TString topicName = "topic2"; + PQCreateStream(topicName); + Init(topicName); + auto source1 = BuildSource(topicName, false); + auto source2 = BuildSource(topicName, true); + StartSession(ReadActorId1, source1); + StartSession(ReadActorId2, source2); + + const std::vector data = { Json1 }; + PQWrite(data, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, { Json1 }); + ExpectMessageBatch(ReadActorId2, { Json1 }); + + StopSession(ReadActorId1, source1); + StopSession(ReadActorId2, source2); + } + + Y_UNIT_TEST_F(SecondSessionWithoutOffsetsAfterSessionConnected, TFixture) { + const TString topicName = "topic3"; + PQCreateStream(topicName); + Init(topicName); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + + const std::vector data = { Json1 }; + PQWrite(data, topicName); + ExpectNewDataArrived({ReadActorId1}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, data); + + StartSession(ReadActorId2, source); + + const std::vector data2 = { Json2 }; + PQWrite(data2, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new 
TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, data2); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId2, data2); + + StopSession(ReadActorId1, source); + StopSession(ReadActorId2, source); + } + + Y_UNIT_TEST_F(TwoSessionsWithOffsets, TFixture) { + const TString topicName = "topic4"; + PQCreateStream(topicName); + Init(topicName); + auto source = BuildSource(topicName); + const std::vector data = { Json1, Json2, Json3}; + PQWrite(data, topicName); + + StartSession(ReadActorId1, source, 1); + StartSession(ReadActorId2, source, 2); + + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + std::vector expected1 = { Json2, Json3}; + ExpectMessageBatch(ReadActorId1, expected1); + + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + std::vector expected2 = { Json3 }; + ExpectMessageBatch(ReadActorId2, expected2); + + const std::vector data2 = { Json4 }; + PQWrite(data2, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, data2); + + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId2, data2); + + StopSession(ReadActorId1, source); + StopSession(ReadActorId2, source); + } + + Y_UNIT_TEST_F(BadDataSessionError, TFixture) { + const TString topicName = "topic5"; + PQCreateStream(topicName); + Init(topicName); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + + const std::vector data = { "not json", "noch einmal / nicht json" }; + PQWrite(data, topicName); + + ExpectSessionError(ReadActorId1, "DB::ParsingException: Cannot parse input: expected '{' 
before: 'not json': (at row 1)"); + StopSession(ReadActorId1, source); + } + + Y_UNIT_TEST_F(RestartSessionIfNewClientWithOffset, TFixture) { + const TString topicName = "topic6"; + PQCreateStream(topicName); + Init(topicName); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + + const std::vector data = { Json1, Json2 }; // offset 0, 1 + PQWrite(data, topicName); + ExpectNewDataArrived({ReadActorId1}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, data); + + // Restart topic session. + StartSession(ReadActorId2, source, 1); + ExpectNewDataArrived({ReadActorId2}); + + PQWrite({ Json3 }, topicName); + ExpectNewDataArrived({ReadActorId1}); + + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, { Json3 }); + + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId2, { Json2, Json3 }); + + StopSession(ReadActorId1, source); + StopSession(ReadActorId2, source); + } + + Y_UNIT_TEST_F(ReadNonExistentTopic, TFixture) { + const TString topicName = "topic7"; + Init(topicName); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + ExpectSessionError(ReadActorId1, "no path"); + StopSession(ReadActorId1, source); + } + + Y_UNIT_TEST_F(SlowSession, TFixture) { + const TString topicName = "topic8"; + PQCreateStream(topicName); + Init(topicName, 50); + auto source = BuildSource(topicName); + StartSession(ReadActorId1, source); + StartSession(ReadActorId2, source); + + size_t messagesSize = 5; + for (size_t i = 0; i < messagesSize; ++i) { + const std::vector data = { Json1 }; + PQWrite(data, topicName); + } + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + + auto readMessages = ReadMessages(ReadActorId1); + UNIT_ASSERT(readMessages == messagesSize); + + // Reading from 
yds is stopped. + + for (size_t i = 0; i < messagesSize; ++i) { + const std::vector data = { Json1 }; + PQWrite(data, topicName); + } + Sleep(TDuration::MilliSeconds(100)); + Runtime.DispatchEvents({}, Runtime.GetCurrentTime() - TDuration::MilliSeconds(1)); + + readMessages = ReadMessages(ReadActorId1); + UNIT_ASSERT(readMessages == 0); + + readMessages = ReadMessages(ReadActorId2); + UNIT_ASSERT(readMessages == messagesSize); + + Sleep(TDuration::MilliSeconds(100)); + Runtime.DispatchEvents({}, Runtime.GetCurrentTime() - TDuration::MilliSeconds(1)); + + readMessages = ReadMessages(ReadActorId1); + UNIT_ASSERT(readMessages == messagesSize); + + readMessages = ReadMessages(ReadActorId2); + UNIT_ASSERT(readMessages == messagesSize); + + StopSession(ReadActorId1, source); + StopSession(ReadActorId2, source); + } +} + +} + diff --git a/ydb/core/fq/libs/row_dispatcher/ut/ya.make b/ydb/core/fq/libs/row_dispatcher/ut/ya.make new file mode 100644 index 000000000000..25242d092f28 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ut/ya.make @@ -0,0 +1,30 @@ +UNITTEST_FOR(ydb/core/fq/libs/row_dispatcher) + +INCLUDE(${ARCADIA_ROOT}/ydb/tests/tools/fq_runner/ydb_runner_with_datastreams.inc) + +SRCS( + coordinator_ut.cpp + json_filter_ut.cpp + json_parser_ut.cpp + leader_election_ut.cpp + row_dispatcher_ut.cpp + topic_session_ut.cpp +) + +PEERDIR( + library/cpp/testing/unittest + ydb/core/fq/libs/row_dispatcher + ydb/core/testlib + ydb/core/testlib/actors + ydb/library/yql/udfs/common/json2 + ydb/library/yql/udfs/common/yson2 + ydb/tests/fq/pq_async_io + ydb/library/yql/sql/pg_dummy + ydb/library/yql/udfs/common/clickhouse/client +) + +SIZE(MEDIUM) + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/core/fq/libs/row_dispatcher/ya.make b/ydb/core/fq/libs/row_dispatcher/ya.make new file mode 100644 index 000000000000..f1f036d20dc0 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/ya.make @@ -0,0 +1,39 @@ +LIBRARY() + +SRCS( + actors_factory.cpp + coordinator.cpp + 
json_filter.cpp + json_parser.cpp + leader_election.cpp + row_dispatcher_service.cpp + row_dispatcher.cpp + topic_session.cpp +) + +PEERDIR( + contrib/libs/fmt + ydb/core/fq/libs/actors/logging + ydb/core/fq/libs/config/protos + ydb/core/fq/libs/control_plane_storage + ydb/core/fq/libs/row_dispatcher/events + ydb/core/fq/libs/shared_resources + ydb/core/fq/libs/ydb + ydb/library/actors/core + ydb/library/security + ydb/library/yql/dq/actors/common + ydb/library/yql/dq/actors/compute + ydb/library/yql/dq/proto + ydb/library/yql/providers/pq/provider + ydb/library/yql/public/purecalc/common/no_pg_wrapper + ydb/public/sdk/cpp/client/ydb_scheme + ydb/public/sdk/cpp/client/ydb_table +) + +YQL_LAST_ABI_VERSION() + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/core/fq/libs/ya.make b/ydb/core/fq/libs/ya.make index cf1038e7f404..34d75300f893 100644 --- a/ydb/core/fq/libs/ya.make +++ b/ydb/core/fq/libs/ya.make @@ -29,6 +29,7 @@ RECURSE( rate_limiter read_rule result_formatter + row_dispatcher shared_resources signer tasks_packer diff --git a/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp index 236907c304c0..e00acf0815a7 100644 --- a/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp +++ b/ydb/core/kqp/query_compiler/kqp_query_compiler.cpp @@ -1020,7 +1020,7 @@ class TKqpQueryCompiler : public IKqpQueryCompiler { google::protobuf::Any& settings = *externalSource.MutableSettings(); TString& sourceType = *externalSource.MutableType(); - dqIntegration->FillSourceSettings(source.Ref(), settings, sourceType, maxTasksPerStage); + dqIntegration->FillSourceSettings(source.Ref(), settings, sourceType, maxTasksPerStage, ctx); YQL_ENSURE(!settings.type_url().empty(), "Data source provider \"" << dataSourceCategory << "\" didn't fill dq source settings for its dq source node"); YQL_ENSURE(sourceType, "Data source provider \"" << dataSourceCategory << "\" didn't fill dq source settings type for its dq source node"); } diff 
--git a/ydb/library/services/services.proto b/ydb/library/services/services.proto index 383df604b5f4..8198603a909b 100644 --- a/ydb/library/services/services.proto +++ b/ydb/library/services/services.proto @@ -345,6 +345,9 @@ enum EServiceKikimr { DB_POOL = 1167; YTS = 1168; + OBJECT_STORAGE_INFERENCINATOR = 1169; + FQ_ROW_DISPATCHER = 1170; + // 1024 - 1099 is reserved for nbs // Change exchange (async indexes & CDC) diff --git a/ydb/library/yql/dq/actors/compute/retry_queue.cpp b/ydb/library/yql/dq/actors/common/retry_queue.cpp similarity index 60% rename from ydb/library/yql/dq/actors/compute/retry_queue.cpp rename to ydb/library/yql/dq/actors/common/retry_queue.cpp index 243510dda242..1209f954d5e5 100644 --- a/ydb/library/yql/dq/actors/compute/retry_queue.cpp +++ b/ydb/library/yql/dq/actors/common/retry_queue.cpp @@ -1,15 +1,24 @@ #include "retry_queue.h" #include +#include namespace NYql::NDq { -void TRetryEventsQueue::Init(const TTxId& txId, const NActors::TActorId& senderId, const NActors::TActorId& selfId, ui64 eventQueueId) { +const ui64 PingPeriodSeconds = 2; + +void TRetryEventsQueue::Init( + const TTxId& txId, + const NActors::TActorId& senderId, + const NActors::TActorId& selfId, + ui64 eventQueueId, + bool keepAlive) { TxId = txId; SenderId = senderId; SelfId = selfId; Y_ASSERT(SelfId.NodeId() == SenderId.NodeId()); EventQueueId = eventQueueId; + KeepAlive = keepAlive; } void TRetryEventsQueue::OnNewRecipientId(const NActors::TActorId& recipientId, bool unsubscribe) { @@ -44,6 +53,9 @@ void TRetryEventsQueue::HandleNodeConnected(ui32 nodeId) { SendRetryable(ev); } } + if (KeepAlive) { + SchedulePing(); + } } } @@ -54,6 +66,14 @@ bool TRetryEventsQueue::HandleUndelivered(NActors::TEvents::TEvUndelivered::TPtr return true; } + if (ev->Sender == RecipientId && ev->Get()->Reason == NActors::TEvents::TEvUndelivered::ReasonActorUnknown) { + if (KeepAlive) { + NActors::TActivationContext::Send( + new NActors::IEventHandle(SelfId, SelfId, new 
TEvRetryQueuePrivate::TEvSessionClosed(EventQueueId), 0, 0)); + } + return true; + } + return false; } @@ -64,10 +84,28 @@ void TRetryEventsQueue::Retry() { } } +void TRetryEventsQueue::Ping() { + PingScheduled = false; + + if (!Connected) { + return; + } + + if (TInstant::Now() - LastReceivedDataTime < TDuration::Seconds(PingPeriodSeconds)) { + SchedulePing(); + return; + } + + auto ev = MakeHolder(); + NActors::TActivationContext::Send(new NActors::IEventHandle(RecipientId, SenderId, ev.Release(), NActors::IEventHandle::FlagTrackDelivery)); + SchedulePing(); +} + void TRetryEventsQueue::Connect() { auto connectEvent = MakeHolder(); + auto proxyId = NActors::TActivationContext::InterconnectProxy(RecipientId.NodeId()); NActors::TActivationContext::Send( - new NActors::IEventHandle(NActors::TActivationContext::InterconnectProxy(RecipientId.NodeId()), SenderId, connectEvent.Release(), 0, 0)); + new NActors::IEventHandle(proxyId, SenderId, connectEvent.Release(), 0, 0)); } void TRetryEventsQueue::Unsubscribe() { @@ -97,14 +135,25 @@ void TRetryEventsQueue::SendRetryable(const IRetryableEvent::TPtr& ev) { } void TRetryEventsQueue::ScheduleRetry() { - if (!RetryScheduled && !Events.empty()) { - RetryScheduled = true; - if (!RetryState) { - RetryState.ConstructInPlace(); - } - auto ev = MakeHolder(EventQueueId); - NActors::TActivationContext::Schedule(RetryState->GetNextDelay(), new NActors::IEventHandle(SelfId, SelfId, ev.Release())); + if (RetryScheduled) { + return; + } + RetryScheduled = true; + if (!RetryState) { + RetryState.ConstructInPlace(); } + auto ev = MakeHolder(EventQueueId); + NActors::TActivationContext::Schedule(RetryState->GetNextDelay(), new NActors::IEventHandle(SelfId, SelfId, ev.Release())); +} + +void TRetryEventsQueue::SchedulePing() { + if (!KeepAlive || PingScheduled) { + return; + } + + PingScheduled = true; + auto ev = MakeHolder(EventQueueId); + NActors::TActivationContext::Schedule(TDuration::Seconds(PingPeriodSeconds), new 
NActors::IEventHandle(SelfId, SelfId, ev.Release())); } TDuration TRetryEventsQueue::TRetryState::GetNextDelay() { @@ -120,4 +169,10 @@ TDuration TRetryEventsQueue::TRetryState::RandomizeDelay(TDuration baseDelay) { return TDuration::FromValue(half + RandomNumber(half)); } +void TRetryEventsQueue::PrintInternalState(TStringStream& stream) const { + stream << "RetryQueue: id " << EventQueueId << ", NextSeqNo " + << NextSeqNo << ", MyConfirmedSeqNo " << MyConfirmedSeqNo << ", SeqNos " << ReceivedEventsSeqNos.size() << ", events size " << Events.size() << "\n"; +} + + } // namespace NYql::NDq diff --git a/ydb/library/yql/dq/actors/compute/retry_queue.h b/ydb/library/yql/dq/actors/common/retry_queue.h similarity index 87% rename from ydb/library/yql/dq/actors/compute/retry_queue.h rename to ydb/library/yql/dq/actors/common/retry_queue.h index 875aef00c99e..5c9a8f317bf3 100644 --- a/ydb/library/yql/dq/actors/compute/retry_queue.h +++ b/ydb/library/yql/dq/actors/common/retry_queue.h @@ -9,6 +9,7 @@ #include #include +#include namespace NYql::NDq { @@ -16,9 +17,9 @@ struct TEvRetryQueuePrivate { // Event ids. 
enum EEv : ui32 { EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), - EvRetry = EvBegin, - + EvPing, + EvSessionClosed, // recipientId does not exist anymore EvEnd }; @@ -29,9 +30,22 @@ struct TEvRetryQueuePrivate { struct TEvRetry : NActors::TEventLocal { explicit TEvRetry(ui64 eventQueueId) : EventQueueId(eventQueueId) - { - } + { } + const ui64 EventQueueId; + }; + struct TEvPing : NActors::TEventLocal { + explicit TEvPing(ui64 eventQueueId) + : EventQueueId(eventQueueId) + { } + const ui64 EventQueueId; + }; + + + struct TEvSessionClosed : NActors::TEventLocal { + explicit TEvSessionClosed(ui64 eventQueueId) + : EventQueueId(eventQueueId) + { } const ui64 EventQueueId; }; @@ -55,6 +69,7 @@ template concept TProtobufEventWithTransportMeta = TProtobufEvent && THasTransportMeta; class TRetryEventsQueue { + public: class IRetryableEvent : public TSimpleRefCount { public: @@ -64,7 +79,9 @@ class TRetryEventsQueue { virtual ui64 GetSeqNo() const = 0; }; - void Init(const TTxId& txId, const NActors::TActorId& senderId, const NActors::TActorId& selfId, ui64 eventQueueId = 0); + TRetryEventsQueue() {} + + void Init(const TTxId& txId, const NActors::TActorId& senderId, const NActors::TActorId& selfId, ui64 eventQueueId = 0, bool keepAlive = false); template void Send(T* ev, ui64 cookie = 0) { @@ -93,6 +110,7 @@ class TRetryEventsQueue { template bool OnEventReceived(const T* ev) { // Returns true if event was not processed (== it was received first time). 
+ LastReceivedDataTime = TInstant::Now(); if (LocalRecipient) { return true; } @@ -120,7 +138,7 @@ class TRetryEventsQueue { } return false; } - + bool RemoveConfirmedEvents() { RemoveConfirmedEvents(MyConfirmedSeqNo); return !Events.empty(); @@ -131,7 +149,9 @@ class TRetryEventsQueue { void HandleNodeDisconnected(ui32 nodeId); bool HandleUndelivered(NActors::TEvents::TEvUndelivered::TPtr& ev); void Retry(); + void Ping(); void Unsubscribe(); + void PrintInternalState(TStringStream& stream) const; private: template @@ -144,6 +164,7 @@ class TRetryEventsQueue { void RemoveConfirmedEvents(ui64 confirmedSeqNo); void SendRetryable(const IRetryableEvent::TPtr& ev); void ScheduleRetry(); + void SchedulePing(); void Connect(); private: @@ -199,8 +220,11 @@ class TRetryEventsQueue { std::set ReceivedEventsSeqNos; bool Connected = false; bool RetryScheduled = false; + bool PingScheduled = false; TMaybe RetryState; TTxId TxId; + bool KeepAlive = false; + TInstant LastReceivedDataTime = TInstant::Now(); }; } // namespace NYql::NDq diff --git a/ydb/library/yql/dq/actors/common/ut/retry_events_queue_ut.cpp b/ydb/library/yql/dq/actors/common/ut/retry_events_queue_ut.cpp new file mode 100644 index 000000000000..a1b45ccf34bf --- /dev/null +++ b/ydb/library/yql/dq/actors/common/ut/retry_events_queue_ut.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace NActors; +using namespace NYql::NDq; + +namespace { + +const ui64 EventQueueId = 777; + +struct TEvPrivate { + // Event ids + enum EEv : ui32 { + EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), + EvSend = EvBegin + 10, + EvData, + EvDisconnect, + EvEnd + }; + static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); + struct TEvSend : public TEventLocal {}; + struct TEvData : public TEventLocal {}; + struct TEvDisconnect : public TEventLocal {}; +}; + + 
+class ClientActor : public TActorBootstrapped { +public: + ClientActor( + NActors::TActorId clientEdgeActorId, + NActors::TActorId serverActorId) + : ServerActorId(serverActorId) + , ClientEdgeActorId(clientEdgeActorId) {} + + void Bootstrap() { + Become(&ClientActor::StateFunc); + Init(); + } + + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ) { + EventsQueue.Retry(); + } + + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr& ) { + EventsQueue.Ping(); + } + + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr& ) { + Send(ClientEdgeActorId, new TEvPrivate::TEvDisconnect()); + } + + void Handle(const TEvPrivate::TEvSend::TPtr& ) { + EventsQueue.Send(new TEvDqCompute::TEvInjectCheckpoint()); + } + + void HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + EventsQueue.HandleNodeDisconnected(ev->Get()->NodeId); + } + + void HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) { + EventsQueue.HandleNodeConnected(ev->Get()->NodeId); + } + + void Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { + EventsQueue.HandleUndelivered(ev); + } + + STRICT_STFUNC(StateFunc, + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvPing, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle); + hFunc(TEvPrivate::TEvSend, Handle); + hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected); + hFunc(NActors::TEvents::TEvUndelivered, Handle); + ) + + void Init() { + EventsQueue.Init("TxId", SelfId(), SelfId(), EventQueueId, true /*KeepAlive*/); + EventsQueue.OnNewRecipientId(ServerActorId); + } + + NYql::NDq::TRetryEventsQueue EventsQueue; + NActors::TActorId ServerActorId; + NActors::TActorId ClientEdgeActorId; +}; + +class ServerActor : public TActorBootstrapped { +public: + ServerActor(NActors::TActorId serverEdgeActorId) + : ServerEdgeActorId(serverEdgeActorId) 
{} + + void Bootstrap() { + Become(&ServerActor::StateFunc); + } + + STRICT_STFUNC(StateFunc, + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle); + hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected); + hFunc(NActors::TEvents::TEvUndelivered, Handle); + hFunc(TEvDqCompute::TEvInjectCheckpoint, Handle); + hFunc(TEvents::TEvPoisonPill, Handle); + ) + + void Handle(const TEvents::TEvPoisonPill::TPtr& ) { + PassAway(); + } + + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ) { + EventsQueue.Retry(); + } + + void Handle(const TEvDqCompute::TEvInjectCheckpoint::TPtr& /*ev*/) { + Send(ServerEdgeActorId, new TEvDqCompute::TEvInjectCheckpoint()); + } + + void HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + EventsQueue.HandleNodeDisconnected(ev->Get()->NodeId); + } + + void HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) { + EventsQueue.HandleNodeConnected(ev->Get()->NodeId); + } + + void Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { + EventsQueue.HandleUndelivered(ev); + } + + NYql::NDq::TRetryEventsQueue EventsQueue; + NActors::TActorId ServerEdgeActorId; +}; + +struct TRuntime: public NActors::TTestBasicRuntime +{ +public: + TRuntime() + : NActors::TTestBasicRuntime(2, true){ + Initialize(NKikimr::TAppPrepare().Unwrap()); + SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); + + ClientEdgeActorId = AllocateEdgeActor(0); + ServerEdgeActorId = AllocateEdgeActor(1); + + Server = new ServerActor(ServerEdgeActorId); + ServerActorId = Register(Server, 1); + EnableScheduleForActor(ServerActorId, true); + + Client = new ClientActor(ClientEdgeActorId, ServerActorId); + ClientActorId = Register(Client, 0); + EnableScheduleForActor(ClientActorId, true); + } + + ClientActor* Client; + ServerActor* Server; + NActors::TActorId ClientActorId; + NActors::TActorId ServerActorId; + NActors::TActorId ClientEdgeActorId; + 
NActors::TActorId ServerEdgeActorId; +}; + +Y_UNIT_TEST_SUITE(TRetryEventsQueueTest) { + Y_UNIT_TEST(SendDisconnectAfterPoisonPill) { + TRuntime runtime; + + runtime.Send(new IEventHandle( + runtime.ClientActorId, + runtime.ClientEdgeActorId, + new TEvPrivate::TEvSend())); + + TEvDqCompute::TEvInjectCheckpoint::TPtr event = runtime.GrabEdgeEvent(runtime.ServerEdgeActorId); + UNIT_ASSERT(event); + + runtime.Send(runtime.ServerActorId, runtime.ServerEdgeActorId, new TEvents::TEvPoisonPill()); + + TEvPrivate::TEvDisconnect::TPtr disconnectEvent = runtime.GrabEdgeEvent(runtime.ClientEdgeActorId); + UNIT_ASSERT(disconnectEvent); + } +} + +} diff --git a/ydb/library/yql/dq/actors/common/ut/ya.make b/ydb/library/yql/dq/actors/common/ut/ya.make new file mode 100644 index 000000000000..bdc87264c395 --- /dev/null +++ b/ydb/library/yql/dq/actors/common/ut/ya.make @@ -0,0 +1,16 @@ +UNITTEST_FOR(ydb/library/yql/dq/actors/common) + +SRCS( + retry_events_queue_ut.cpp +) + +PEERDIR( + library/cpp/testing/unittest + ydb/core/testlib/actors + ydb/core/testlib + ydb/library/yql/sql/pg_dummy +) + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/library/yql/dq/actors/common/ya.make b/ydb/library/yql/dq/actors/common/ya.make new file mode 100644 index 000000000000..9311daae3a9f --- /dev/null +++ b/ydb/library/yql/dq/actors/common/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +SRCS( + retry_queue.cpp +) + +PEERDIR( + ydb/library/actors/core + ydb/library/yql/dq/actors/protos + ydb/library/yql/public/issue +) + +YQL_LAST_ABI_VERSION() + +END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/library/yql/dq/actors/compute/dq_compute_actor_checkpoints.h b/ydb/library/yql/dq/actors/compute/dq_compute_actor_checkpoints.h index 21bc494fef8a..27a4ad7a8407 100644 --- a/ydb/library/yql/dq/actors/compute/dq_compute_actor_checkpoints.h +++ b/ydb/library/yql/dq/actors/compute/dq_compute_actor_checkpoints.h @@ -2,8 +2,8 @@ #include "dq_compute_actor.h" #include "dq_compute_actor_async_io.h" -#include 
"retry_queue.h" +#include #include #include diff --git a/ydb/library/yql/dq/actors/compute/ya.make b/ydb/library/yql/dq/actors/compute/ya.make index 3cec159c2462..018af068e57a 100644 --- a/ydb/library/yql/dq/actors/compute/ya.make +++ b/ydb/library/yql/dq/actors/compute/ya.make @@ -11,7 +11,6 @@ SRCS( dq_compute_actor_watermarks.cpp dq_compute_actor.cpp dq_compute_issues_buffer.cpp - retry_queue.cpp dq_request_context.h dq_request_context.cpp ) @@ -21,11 +20,12 @@ PEERDIR( ydb/library/actors/wilson/protos ydb/library/services ydb/library/ydb_issue/proto + ydb/library/yql/dq/actors/common + ydb/library/yql/dq/actors/spilling ydb/library/yql/dq/common ydb/library/yql/dq/proto ydb/library/yql/dq/runtime ydb/library/yql/dq/tasks - ydb/library/yql/dq/actors/spilling ydb/library/yql/minikql ydb/library/yql/minikql/comp_nodes ydb/library/yql/public/issue diff --git a/ydb/library/yql/dq/integration/yql_dq_integration.h b/ydb/library/yql/dq/integration/yql_dq_integration.h index 67f317477181..644ebb5583a8 100644 --- a/ydb/library/yql/dq/integration/yql_dq_integration.h +++ b/ydb/library/yql/dq/integration/yql_dq_integration.h @@ -62,7 +62,7 @@ class IDqIntegration { virtual bool CanBlockRead(const NNodes::TExprBase& node, TExprContext& ctx, TTypeAnnotationContext& typesCtx) = 0; virtual void RegisterMkqlCompiler(NCommon::TMkqlCallableCompilerBase& compiler) = 0; virtual bool CanFallback() = 0; - virtual void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType, size_t maxPartitions) = 0; + virtual void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType, size_t maxPartitions, TExprContext& ctx) = 0; virtual void FillLookupSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType) = 0; virtual void FillSinkSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sinkType) = 0; virtual void FillTransformSettings(const TExprNode& node, 
::google::protobuf::Any& settings) = 0; diff --git a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp index f5aa18638eb5..0f997f1e4495 100644 --- a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp +++ b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp @@ -75,7 +75,7 @@ class TClickHouseDqIntegration: public TDqIntegrationBase { return 0ULL; } - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t) override { + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t, TExprContext&) override { const TDqSource source(&node); if (const auto maySettings = source.Settings().Maybe()) { const auto settings = maySettings.Cast(); diff --git a/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.cpp b/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.cpp index 04b343055415..d1fa2ca23219 100644 --- a/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.cpp +++ b/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.cpp @@ -51,7 +51,7 @@ bool TDqIntegrationBase::CanFallback() { return false; } -void TDqIntegrationBase::FillSourceSettings(const TExprNode&, ::google::protobuf::Any&, TString&, size_t) { +void TDqIntegrationBase::FillSourceSettings(const TExprNode&, ::google::protobuf::Any&, TString&, size_t, TExprContext&) { } void TDqIntegrationBase::FillLookupSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType) { diff --git a/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.h b/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.h index facb55a68f64..0737f4791b83 100644 --- a/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.h +++ 
b/ydb/library/yql/providers/common/dq/yql_dq_integration_impl.h @@ -18,7 +18,7 @@ class TDqIntegrationBase: public IDqIntegration { bool CanBlockRead(const NNodes::TExprBase& node, TExprContext& ctx, TTypeAnnotationContext& typesCtx) override; TExprNode::TPtr WrapWrite(const TExprNode::TPtr& write, TExprContext& ctx) override; bool CanFallback() override; - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType, size_t) override; + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType, size_t, TExprContext&) override; void FillLookupSourceSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sourceType) override; void FillSinkSettings(const TExprNode& node, ::google::protobuf::Any& settings, TString& sinkType) override; void FillTransformSettings(const TExprNode& node, ::google::protobuf::Any& settings) override; diff --git a/ydb/library/yql/providers/common/proto/gateways_config.proto b/ydb/library/yql/providers/common/proto/gateways_config.proto index 61d9082f528f..dbb433a4e2f1 100644 --- a/ydb/library/yql/providers/common/proto/gateways_config.proto +++ b/ydb/library/yql/providers/common/proto/gateways_config.proto @@ -326,6 +326,7 @@ message TPqClusterConfig { optional bool AddBearerToToken = 11; // whether to use prefix "Bearer " in token optional string DatabaseId = 12; repeated TAttr Settings = 100; + optional bool SharedReading = 101; } message TPqGatewayConfig { diff --git a/ydb/library/yql/providers/common/pushdown/physical_opt.cpp b/ydb/library/yql/providers/common/pushdown/physical_opt.cpp new file mode 100644 index 000000000000..602ab6c8f0b8 --- /dev/null +++ b/ydb/library/yql/providers/common/pushdown/physical_opt.cpp @@ -0,0 +1,73 @@ +#include "predicate_node.h" + +#include +#include +#include +#include + +namespace NYql::NPushdown { + +using namespace NNodes; + +namespace { + +TPredicateNode SplitForPartialPushdown( + const 
NPushdown::TPredicateNode& predicateTree, + TExprContext& ctx, + TPositionHandle pos) { + if (predicateTree.CanBePushed) { + return predicateTree; + } + + if (predicateTree.Op != NPushdown::EBoolOp::And) { + return NPushdown::TPredicateNode(); // Not valid, => return the same node from optimizer + } + + std::vector pushable; + for (auto& predicate : predicateTree.Children) { + if (predicate.CanBePushed) { + pushable.emplace_back(predicate); + } + } + NPushdown::TPredicateNode predicateToPush; + predicateToPush.SetPredicates(pushable, ctx, pos); + return predicateToPush; +} + +} + +TMaybeNode MakePushdownPredicate(const TCoLambda& lambda, TExprContext& ctx, const TPositionHandle& pos, const TSettings& settings) { + auto lambdaArg = lambda.Args().Arg(0).Ptr(); + + YQL_LOG(TRACE) << "Push filter. Initial filter lambda: " << NCommon::ExprToPrettyString(ctx, lambda.Ref()); + + auto maybeOptionalIf = lambda.Body().Maybe(); + if (!maybeOptionalIf.IsValid()) { // Nothing to push + return {}; + } + + TCoOptionalIf optionalIf = maybeOptionalIf.Cast(); + NPushdown::TPredicateNode predicateTree(optionalIf.Predicate()); + NPushdown::CollectPredicates(optionalIf.Predicate(), predicateTree, lambdaArg.Get(), TExprBase(lambdaArg), settings); + YQL_ENSURE(predicateTree.IsValid(), "Collected filter predicates are invalid"); + + NPushdown::TPredicateNode predicateToPush = SplitForPartialPushdown(predicateTree, ctx, pos); + if (!predicateToPush.IsValid()) { + return {}; + } + + // clang-format off + auto newFilterLambda = Build(ctx, pos) + .Args({"filter_row"}) + .Body() + .Apply(predicateToPush.ExprNode.Cast()) + .With(TExprBase(lambdaArg), "filter_row") + .Build() + .Done(); + // clang-format on + + YQL_LOG(INFO) << "Push filter lambda: " << NCommon::ExprToPrettyString(ctx, *newFilterLambda.Ptr()); + return newFilterLambda; +} + +} // namespace NYql::NPushdown diff --git a/ydb/library/yql/providers/common/pushdown/physical_opt.h 
b/ydb/library/yql/providers/common/pushdown/physical_opt.h new file mode 100644 index 000000000000..f4a9bbe19068 --- /dev/null +++ b/ydb/library/yql/providers/common/pushdown/physical_opt.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include +#include + +namespace NYql::NPushdown { + +NNodes::TMaybeNode MakePushdownPredicate(const NNodes::TCoLambda& lambda, TExprContext& ctx, const TPositionHandle& pos, const TSettings& settings); + +} // namespace NYql::NPushdown diff --git a/ydb/library/yql/providers/common/pushdown/type_ann.cpp b/ydb/library/yql/providers/common/pushdown/type_ann.cpp new file mode 100644 index 000000000000..5ba21286a394 --- /dev/null +++ b/ydb/library/yql/providers/common/pushdown/type_ann.cpp @@ -0,0 +1,36 @@ +#include "predicate_node.h" + +#include +#include + +namespace NYql::NPushdown { + +IGraphTransformer::TStatus AnnotateFilterPredicate(const TExprNode::TPtr& input, size_t childIndex, const TStructExprType* itemType, TExprContext& ctx) { + if (childIndex >= input->ChildrenSize()) { + return IGraphTransformer::TStatus::Error; + } + + auto& filterLambda = input->ChildRef(childIndex); + if (!EnsureLambda(*filterLambda, ctx)) { + return IGraphTransformer::TStatus::Error; + } + + if (!UpdateLambdaAllArgumentsTypes(filterLambda, {itemType}, ctx)) { + return IGraphTransformer::TStatus::Error; + } + + if (const auto* filterLambdaType = filterLambda->GetTypeAnn()) { + if (filterLambdaType->GetKind() != ETypeAnnotationKind::Data) { + return IGraphTransformer::TStatus::Error; + } + const TDataExprType* dataExprType = static_cast(filterLambdaType); + if (dataExprType->GetSlot() != EDataSlot::Bool) { + return IGraphTransformer::TStatus::Error; + } + } else { + return IGraphTransformer::TStatus::Repeat; + } + return IGraphTransformer::TStatus::Ok; +} + +} // namespace NYql::NPushdown diff --git a/ydb/library/yql/providers/common/pushdown/type_ann.h b/ydb/library/yql/providers/common/pushdown/type_ann.h new file mode 100644 index 
000000000000..4d879674fb0e --- /dev/null +++ b/ydb/library/yql/providers/common/pushdown/type_ann.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include +#include + +namespace NYql::NPushdown { + +IGraphTransformer::TStatus AnnotateFilterPredicate( + const TExprNode::TPtr& input, + size_t childIndex, + const TStructExprType* itemType, + TExprContext& ctx); + + +} // namespace NYql::NPushdown diff --git a/ydb/library/yql/providers/common/pushdown/ya.make b/ydb/library/yql/providers/common/pushdown/ya.make index f488c383a940..626babe45a16 100644 --- a/ydb/library/yql/providers/common/pushdown/ya.make +++ b/ydb/library/yql/providers/common/pushdown/ya.make @@ -4,6 +4,8 @@ SRCS( collection.cpp predicate_node.cpp settings.cpp + type_ann.cpp + physical_opt.cpp ) PEERDIR( diff --git a/ydb/library/yql/providers/common/ut_helpers/dq_fake_ca.h b/ydb/library/yql/providers/common/ut_helpers/dq_fake_ca.h index 8d50cff70b66..2446f3202745 100644 --- a/ydb/library/yql/providers/common/ut_helpers/dq_fake_ca.h +++ b/ydb/library/yql/providers/common/ut_helpers/dq_fake_ca.h @@ -127,6 +127,7 @@ class TFakeActor : public NActors::TActor { public: IDqComputeActorAsyncInput* DqAsyncInput = nullptr; IDqComputeActorAsyncOutput* DqAsyncOutput = nullptr; + std::optional DqAsyncInputActorId; private: STRICT_STFUNC(StateFunc, @@ -164,7 +165,6 @@ class TFakeActor : public NActors::TActor { NKikimr::NMiniKQL::TDefaultValueBuilder ValueBuilder; private: - std::optional DqAsyncInputActorId; IActor* DqAsyncInputAsActor = nullptr; std::optional DqAsyncOutputActorId; diff --git a/ydb/library/yql/providers/dq/planner/execution_planner.cpp b/ydb/library/yql/providers/dq/planner/execution_planner.cpp index 69507575a78b..7eec1e4d2648 100644 --- a/ydb/library/yql/providers/dq/planner/execution_planner.cpp +++ b/ydb/library/yql/providers/dq/planner/execution_planner.cpp @@ -540,7 +540,7 @@ namespace NYql::NDqs { TString sourceType; if (dqSource) { sourceSettings.ConstructInPlace(); - 
dqIntegration->FillSourceSettings(*read, *sourceSettings, sourceType, maxPartitions); + dqIntegration->FillSourceSettings(*read, *sourceSettings, sourceType, maxPartitions, ExprContext); YQL_ENSURE(!sourceSettings->type_url().empty(), "Data source provider \"" << dataSourceName << "\" did't fill dq source settings for its dq source node"); YQL_ENSURE(sourceType, "Data source provider \"" << dataSourceName << "\" did't fill dq source settings type for its dq source node"); } diff --git a/ydb/library/yql/providers/generic/provider/ut/pushdown/pushdown_ut.cpp b/ydb/library/yql/providers/generic/provider/ut/pushdown/pushdown_ut.cpp index 48bb17d52670..937b5b0c60a4 100644 --- a/ydb/library/yql/providers/generic/provider/ut/pushdown/pushdown_ut.cpp +++ b/ydb/library/yql/providers/generic/provider/ut/pushdown/pushdown_ut.cpp @@ -180,7 +180,7 @@ class TBuildDqSourceSettingsTransformer: public TOptimizeTransformerBase { .Ptr(); ::google::protobuf::Any settings; TString sourceType; - dqIntegration->FillSourceSettings(*dqSourceNode, settings, sourceType, 1); + dqIntegration->FillSourceSettings(*dqSourceNode, settings, sourceType, 1, ctx); UNIT_ASSERT_STRINGS_EQUAL(sourceType, "PostgreSqlGeneric"); UNIT_ASSERT(settings.Is()); settings.UnpackTo(DqSourceSettings_); diff --git a/ydb/library/yql/providers/generic/provider/ya.make b/ydb/library/yql/providers/generic/provider/ya.make index 7d50ad5b7169..4d21e4fb0043 100644 --- a/ydb/library/yql/providers/generic/provider/ya.make +++ b/ydb/library/yql/providers/generic/provider/ya.make @@ -32,6 +32,7 @@ PEERDIR( library/cpp/json library/cpp/random_provider library/cpp/time_provider + ydb/core/fq/libs/common ydb/core/fq/libs/result_formatter ydb/library/yql/ast ydb/library/yql/core diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp index a7b14b26cf2a..d7606a16c423 100644 --- 
a/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -47,34 +48,6 @@ namespace NYql { return TStatus::Ok; } - TStatus AnnotateFilterPredicate(const TExprNode::TPtr& input, size_t childIndex, const TStructExprType* itemType, TExprContext& ctx) { - if (childIndex >= input->ChildrenSize()) { - return TStatus::Error; - } - - auto& filterLambda = input->ChildRef(childIndex); - if (!EnsureLambda(*filterLambda, ctx)) { - return TStatus::Error; - } - - if (!UpdateLambdaAllArgumentsTypes(filterLambda, {itemType}, ctx)) { - return IGraphTransformer::TStatus::Error; - } - - if (const auto* filterLambdaType = filterLambda->GetTypeAnn()) { - if (filterLambdaType->GetKind() != ETypeAnnotationKind::Data) { - return IGraphTransformer::TStatus::Error; - } - const TDataExprType* dataExprType = static_cast(filterLambdaType); - if (dataExprType->GetSlot() != EDataSlot::Bool) { - return IGraphTransformer::TStatus::Error; - } - } else { - return IGraphTransformer::TStatus::Repeat; - } - return TStatus::Ok; - } - TStatus HandleSourceSettings(const TExprNode::TPtr& input, TExprContext& ctx) { if (!EnsureArgsCount(*input, 5, ctx)) { return TStatus::Error; @@ -123,7 +96,7 @@ namespace NYql { } // Filter - const TStatus filterAnnotationStatus = AnnotateFilterPredicate(input, TGenSourceSettings::idx_FilterPredicate, structExprType, ctx); + const TStatus filterAnnotationStatus = NYql::NPushdown::AnnotateFilterPredicate(input, TGenSourceSettings::idx_FilterPredicate, structExprType, ctx); if (filterAnnotationStatus != TStatus::Ok) { return filterAnnotationStatus; } @@ -204,7 +177,7 @@ namespace NYql { } // Filter - const TStatus filterAnnotationStatus = AnnotateFilterPredicate(input, TGenReadTable::idx_FilterPredicate, itemType, ctx); + const TStatus filterAnnotationStatus = 
NYql::NPushdown::AnnotateFilterPredicate(input, TGenReadTable::idx_FilterPredicate, itemType, ctx); if (filterAnnotationStatus != TStatus::Ok) { return filterAnnotationStatus; } diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp index 24fab4c67637..ea093f871458 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp @@ -112,7 +112,7 @@ namespace NYql { } void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, - TString& sourceType, size_t) override { + TString& sourceType, size_t, TExprContext&) override { const TDqSource source(&node); if (const auto maybeSettings = source.Settings().Maybe()) { const auto settings = maybeSettings.Cast(); diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_physical_opt.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_physical_opt.cpp index f68261923e36..c7726ef4dc3b 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_physical_opt.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_physical_opt.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -105,62 +106,6 @@ namespace NYql { return node; } - static NPushdown::TPredicateNode SplitForPartialPushdown(const NPushdown::TPredicateNode& predicateTree, - TExprContext& ctx, TPositionHandle pos) - { - if (predicateTree.CanBePushed) { - return predicateTree; - } - - if (predicateTree.Op != NPushdown::EBoolOp::And) { - return NPushdown::TPredicateNode(); // Not valid, => return the same node from optimizer - } - - std::vector pushable; - for (auto& predicate : predicateTree.Children) { - if (predicate.CanBePushed) { - pushable.emplace_back(predicate); - } - } - NPushdown::TPredicateNode predicateToPush; - predicateToPush.SetPredicates(pushable, ctx, 
pos); - return predicateToPush; - } - - TMaybeNode MakePushdownPredicate(const TCoLambda& lambda, TExprContext& ctx, const TPositionHandle& pos) const { - auto lambdaArg = lambda.Args().Arg(0).Ptr(); - - YQL_CLOG(TRACE, ProviderGeneric) << "Push filter. Initial filter lambda: " << NCommon::ExprToPrettyString(ctx, lambda.Ref()); - - auto maybeOptionalIf = lambda.Body().Maybe(); - if (!maybeOptionalIf.IsValid()) { // Nothing to push - return {}; - } - - TCoOptionalIf optionalIf = maybeOptionalIf.Cast(); - NPushdown::TPredicateNode predicateTree(optionalIf.Predicate()); - NPushdown::CollectPredicates(optionalIf.Predicate(), predicateTree, lambdaArg.Get(), TExprBase(lambdaArg), TPushdownSettings()); - YQL_ENSURE(predicateTree.IsValid(), "Collected filter predicates are invalid"); - - NPushdown::TPredicateNode predicateToPush = SplitForPartialPushdown(predicateTree, ctx, pos); - if (!predicateToPush.IsValid()) { - return {}; - } - - // clang-format off - auto newFilterLambda = Build(ctx, pos) - .Args({"filter_row"}) - .Body() - .Apply(predicateToPush.ExprNode.Cast()) - .With(TExprBase(lambdaArg), "filter_row") - .Build() - .Done(); - // clang-format on - - YQL_CLOG(INFO, ProviderGeneric) << "Push filter lambda: " << NCommon::ExprToPrettyString(ctx, *newFilterLambda.Ptr()); - return newFilterLambda; - } - TMaybeNode PushFilterToReadTable(TExprBase node, TExprContext& ctx) const { if (!State_->Configuration->UsePredicatePushdown.Get().GetOrElse(TGenericSettings::TDefault::UsePredicatePushdown)) { return node; @@ -182,7 +127,7 @@ namespace NYql { return node; } - auto newFilterLambda = MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos()); + auto newFilterLambda = NPushdown::MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos(), TPushdownSettings()); if (!newFilterLambda) { return node; } @@ -223,7 +168,7 @@ namespace NYql { return node; } - auto newFilterLambda = MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos()); + auto newFilterLambda = 
NPushdown::MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos(), TPushdownSettings()); if (!newFilterLambda) { return node; } diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.cpp index 540bd859f7ac..80ea550f6f4d 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.cpp @@ -1,7 +1,7 @@ #include "yql_generic_predicate_pushdown.h" #include - +#include #include namespace NYql { @@ -9,6 +9,19 @@ namespace NYql { using namespace NNodes; using namespace NConnector::NApi; + TString FormatColumn(const TString& value); + TString FormatValue(const Ydb::TypedValue& value); + TString FormatNull(const TExpression_TNull&); + TString FormatExpression(const TExpression& expression); + TString FormatArithmeticalExpression(const TExpression_TArithmeticalExpression& expression); + TString FormatNegation(const TPredicate_TNegation& negation); + TString FormatComparison(const TPredicate_TComparison comparison); + TString FormatConjunction(const TPredicate_TConjunction& conjunction, bool topLevel); + TString FormatDisjunction(const TPredicate_TDisjunction& disjunction); + TString FormatIsNull(const TPredicate_TIsNull& isNull); + TString FormatIsNotNull(const TPredicate_TIsNotNull& isNotNull); + TString FormatPredicate(const TPredicate& predicate, bool topLevel); + namespace { bool SerializeMember(const TCoMember& member, TExpression* proto, const TCoArgument& arg, TStringBuilder& err) { @@ -185,7 +198,221 @@ namespace NYql { err << "unknown predicate: " << predicate.Raw()->Content(); return false; } + } + + TString FormatColumn(const TString& value) { + return NFq::EncloseAndEscapeString(value, '`'); + } + + TString FormatValue(const Ydb::TypedValue& value) { + switch (value.value().value_case()) { + case Ydb::Value::kBoolValue: + return 
ToString(value.value().bool_value()); + case Ydb::Value::kInt32Value: + return ToString(value.value().int32_value()); + case Ydb::Value::kUint32Value: + return ToString(value.value().uint32_value()); + case Ydb::Value::kInt64Value: + return ToString(value.value().int64_value()); + case Ydb::Value::kUint64Value: + return ToString(value.value().uint64_value()); + case Ydb::Value::kFloatValue: + return ToString(value.value().float_value()); + case Ydb::Value::kDoubleValue: + return ToString(value.value().double_value()); + case Ydb::Value::kBytesValue: + return NFq::EncloseAndEscapeString(value.value().bytes_value(), '"'); + case Ydb::Value::kTextValue: + return NFq::EncloseAndEscapeString(value.value().text_value(), '"'); + default: + throw yexception() << "ErrUnimplementedTypedValue, value case " << static_cast(value.value().value_case()); + } + } + + TString FormatNull(const TExpression_TNull&) { + return "NULL"; + } + + TString FormatExpression(const TExpression& expression) { + switch (expression.payload_case()) { + case TExpression::kColumn: + return FormatColumn(expression.column()); + case TExpression::kTypedValue: + return FormatValue(expression.typed_value()); + case TExpression::kArithmeticalExpression: + return FormatArithmeticalExpression(expression.arithmetical_expression()); + case TExpression::kNull: + return FormatNull(expression.null()); + default: + throw yexception() << "UnimplementedExpression, payload_case " << static_cast(expression.payload_case()); + } + } + + TString FormatArithmeticalExpression(const TExpression_TArithmeticalExpression& expression) { + TString operation; + switch (expression.operation()) { + case TExpression_TArithmeticalExpression::MUL: + operation = " * "; + break; + case TExpression_TArithmeticalExpression::ADD: + operation = " + "; + break; + case TExpression_TArithmeticalExpression::SUB: + operation = " - "; + break; + case TExpression_TArithmeticalExpression::BIT_AND: + operation = " & "; + break; + case 
TExpression_TArithmeticalExpression::BIT_OR: + operation = " | "; + break; + case TExpression_TArithmeticalExpression::BIT_XOR: + operation = " ^ "; + break; + default: + throw yexception() << "ErrUnimplementedArithmeticalExpression, operation " << static_cast(expression.operation()); + } + + auto left = FormatExpression(expression.left_value()); + auto right = FormatExpression(expression.right_value()); + return left + operation + right; + } + + TString FormatNegation(const TPredicate_TNegation& negation) { + auto pred = FormatPredicate(negation.operand(), false); + return "(NOT " + pred + ")"; + } + + TString FormatConjunction(const TPredicate_TConjunction& conjunction, bool /*topLevel*/) { + ui32 succeeded = 0; + TStringStream stream; + TString first; + + for (const auto& predicate : conjunction.operands()) { + auto statement = FormatPredicate(predicate, false); + + if (succeeded > 0) { + if (succeeded == 1) { + stream << "("; + stream << first; + } + stream << " AND "; + stream << statement; + } else { + first = statement; + } + succeeded++; + } + + if (succeeded == 0) { + throw yexception() << "failed to format AND statement, no operands"; + } + if (succeeded == 1) { + stream << first; + } else { + stream << ")"; + } + return stream.Str(); + } + + TString FormatDisjunction(const TPredicate_TDisjunction& disjunction) { + TStringStream stream; + TString first; + ui32 cnt = 0; + + for (const auto& predicate : disjunction.operands()) { + auto statement = FormatPredicate(predicate, false); + + if (cnt > 0) { + if (cnt == 1) { + stream << "("; + stream << first; + } + + stream << " OR "; + stream << statement; + } else { + first = statement; + } + cnt++; + } + + if (cnt == 0) { + throw yexception() << "failed to format OR statement: no operands"; + } + + if (cnt == 1) { + stream << first; + } else { + stream << ")"; + } + + return stream.Str(); + } + + TString FormatIsNull(const TPredicate_TIsNull& isNull) { + auto statement = FormatExpression(isNull.value()); + 
return "(" + statement + " IS NULL)"; + } + + TString FormatIsNotNull(const TPredicate_TIsNotNull& isNotNull) { + auto statement = FormatExpression(isNotNull.value()); + return "(" + statement + " IS NOT NULL)"; + } + + TString FormatComparison(TPredicate_TComparison comparison) { + TString operation; + + switch (comparison.operation()) { + case TPredicate_TComparison::L: + operation = " < "; + break; + case TPredicate_TComparison::LE: + operation = " <= "; + break; + case TPredicate_TComparison::EQ: + operation = " = "; + break; + case TPredicate_TComparison::NE: + operation = " <> "; + break; + case TPredicate_TComparison::GE: + operation = " >= "; + break; + case TPredicate_TComparison::G: + operation = " > "; + break; + default: + throw yexception() << "UnimplementedOperation, operation " << static_cast(comparison.operation()); + } + auto left = FormatExpression(comparison.left_value()); + auto right = FormatExpression(comparison.right_value()); + + return left + operation + right; + } + + TString FormatPredicate(const TPredicate& predicate, bool topLevel ) { + switch (predicate.payload_case()) { + case TPredicate::PAYLOAD_NOT_SET: + return {}; + case TPredicate::kNegation: + return FormatNegation(predicate.negation()); + case TPredicate::kConjunction: + return FormatConjunction(predicate.conjunction(), topLevel); + case TPredicate::kDisjunction: + return FormatDisjunction(predicate.disjunction()); + case TPredicate::kIsNull: + return FormatIsNull(predicate.is_null()); + case TPredicate::kIsNotNull: + return FormatIsNotNull(predicate.is_not_null()); + case TPredicate::kComparison: + return FormatComparison(predicate.comparison()); + case TPredicate::kBoolExpression: + return FormatExpression(predicate.bool_expression().value()); + default: + throw yexception() << "UnimplementedPredicateType, payload_case " << static_cast(predicate.payload_case()); + } } bool IsEmptyFilterPredicate(const TCoLambda& lambda) { @@ -200,4 +427,11 @@ namespace NYql { return 
SerializePredicate(predicate.Body(), proto, predicate.Args().Arg(0), err); } + TString FormatWhere(const TPredicate& predicate) { + auto stream = FormatPredicate(predicate, true); + if (stream.empty()) { + return ""; + } + return "WHERE " + stream; + } } // namespace NYql diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.h b/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.h index 121ab5052783..b798e483b8a5 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.h +++ b/ydb/library/yql/providers/generic/provider/yql_generic_predicate_pushdown.h @@ -10,5 +10,5 @@ namespace NYql { bool IsEmptyFilterPredicate(const NNodes::TCoLambda& lambda); bool SerializeFilterPredicate(const NNodes::TCoLambda& predicate, NConnector::NApi::TPredicate* proto, TStringBuilder& err); - + TString FormatWhere(const NConnector::NApi::TPredicate& predicate); } // namespace NYql diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp new file mode 100644 index 000000000000..74fdb0949230 --- /dev/null +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp @@ -0,0 +1,697 @@ +#include "dq_pq_rd_read_actor.h" +#include "probes.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#define SRC_LOG_T(s) \ + LOG_TRACE_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG_D(s) \ + LOG_DEBUG_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG_I(s) \ + LOG_INFO_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, 
LogPrefix << s) +#define SRC_LOG_W(s) \ + LOG_WARN_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG_N(s) \ + LOG_NOTICE_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG_E(s) \ + LOG_ERROR_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG_C(s) \ + LOG_CRIT_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) +#define SRC_LOG(prio, s) \ + LOG_LOG_S(*NActors::TlsActivationContext, prio, NKikimrServices::KQP_COMPUTE, LogPrefix << s) + +namespace NYql::NDq { + +using namespace NActors; +using namespace NLog; +using namespace NKikimr::NMiniKQL; + +namespace { + +LWTRACE_USING(DQ_PQ_PROVIDER); + +} // namespace + +struct TRowDispatcherReadActorMetrics { + explicit TRowDispatcherReadActorMetrics(const TTxId& txId, ui64 taskId, const ::NMonitoring::TDynamicCounterPtr& counters) + : TxId(std::visit([](auto arg) { return ToString(arg); }, txId)) + , Counters(counters) { + SubGroup = Counters->GetSubgroup("sink", "RdPqRead"); + auto sink = SubGroup->GetSubgroup("tx_id", TxId); + auto task = sink->GetSubgroup("task_id", ToString(taskId)); + InFlyGetNextBatch = task->GetCounter("InFlyGetNextBatch"); + } + + ~TRowDispatcherReadActorMetrics() { + SubGroup->RemoveSubgroup("id", TxId); + } + + TString TxId; + ::NMonitoring::TDynamicCounterPtr Counters; + ::NMonitoring::TDynamicCounterPtr SubGroup; + ::NMonitoring::TDynamicCounters::TCounterPtr InFlyGetNextBatch; +}; + +struct TEvPrivate { + enum EEv : ui32 { + EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), + EvPrintState = EvBegin + 20, + EvEnd + }; + static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); + struct TEvPrintState : public NActors::TEventLocal {}; +}; + +ui64 PrintStatePeriodSec = 60; + +class TDqPqRdReadActor : public NActors::TActor, public 
NYql::NDq::NInternal::TDqPqReadActorBase { +public: + using TDebugOffsets = TMaybe>; + + struct TReadyBatch { + public: + TReadyBatch(ui64 partitionId, ui32 dataCapacity) + : PartitionId(partitionId) { + Data.reserve(dataCapacity); + } + + public: + TVector Data; + i64 UsedSpace = 0; + ui64 NextOffset = 0; + ui64 PartitionId; + }; + + enum class EState { + INIT, + WAIT_COORDINATOR_ID, + WAIT_PARTITIONS_ADDRES, + STARTED + }; +private: + std::vector> MetadataFields; + const TString Token; + TMaybe CoordinatorActorId; + NActors::TActorId LocalRowDispatcherActorId; + std::queue ReadyBuffer; + EState State = EState::INIT; + ui64 CoordinatorRequestCookie = 0; + TRowDispatcherReadActorMetrics Metrics; + bool SchedulePrintStatePeriod = false; + + struct SessionInfo { + enum class ESessionStatus { + NoSession, + Started, + }; + SessionInfo( + const TTxId& txId, + const NActors::TActorId selfId, + TActorId rowDispatcherActorId, + ui64 eventQueueId) + : RowDispatcherActorId(rowDispatcherActorId) { + EventsQueue.Init(txId, selfId, selfId, eventQueueId, /* KeepAlive */ true); + EventsQueue.OnNewRecipientId(rowDispatcherActorId); + } + + ESessionStatus Status = ESessionStatus::NoSession; + ui64 NextOffset = 0; + bool IsWaitingRowDispatcherResponse = false; + NYql::NDq::TRetryEventsQueue EventsQueue; + bool NewDataArrived = false; + TActorId RowDispatcherActorId; + }; + + TMap Sessions; + +public: + TDqPqRdReadActor( + ui64 inputIndex, + TCollectStatsLevel statsLevel, + const TTxId& txId, + ui64 taskId, + const THolderFactory& holderFactory, + NPq::NProto::TDqPqTopicSource&& sourceParams, + NPq::NProto::TDqReadTaskParams&& readParams, + const NActors::TActorId& computeActorId, + const NActors::TActorId& localRowDispatcherActorId, + const TString& token, + const ::NMonitoring::TDynamicCounterPtr& counters); + + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvCoordinatorResult::TPtr& ev); + void 
Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStartSessionAck::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev); + + void HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev); + void HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev); + void Handle(NActors::TEvents::TEvUndelivered::TPtr& ev); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr&); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr&); + void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr&); + void Handle(NActors::TEvents::TEvPong::TPtr& ev); + void Handle(const NActors::TEvents::TEvPing::TPtr&); + void Handle(TEvPrivate::TEvPrintState::TPtr&); + + STRICT_STFUNC(StateFunc, { + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChanged, Handle); + hFunc(NFq::TEvRowDispatcher::TEvCoordinatorResult, Handle); + hFunc(NFq::TEvRowDispatcher::TEvNewDataArrived, Handle); + hFunc(NFq::TEvRowDispatcher::TEvMessageBatch, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStartSessionAck, Handle); + hFunc(NFq::TEvRowDispatcher::TEvSessionError, Handle); + hFunc(NFq::TEvRowDispatcher::TEvStatus, Handle); + + hFunc(NActors::TEvents::TEvPong, Handle); + hFunc(TEvInterconnect::TEvNodeConnected, HandleConnected); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleDisconnected); + hFunc(NActors::TEvents::TEvUndelivered, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvPing, Handle); + hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle); + hFunc(NActors::TEvents::TEvPing, Handle); + hFunc(TEvPrivate::TEvPrintState, Handle); + }) + + static constexpr char ActorName[] = "DQ_PQ_READ_ACTOR"; + + void CommitState(const NDqProto::TCheckpoint& checkpoint) override; + void PassAway() 
override; + i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& buffer, TMaybe& watermark, bool&, i64 freeSpace) override; + std::vector GetPartitionsToRead() const; + std::pair CreateItem(const TString& data); + void ProcessState(); + void Stop(const TString& message); + void StopSessions(); + void ReInit(); + void PrintInternalState(); +}; + +TDqPqRdReadActor::TDqPqRdReadActor( + ui64 inputIndex, + TCollectStatsLevel statsLevel, + const TTxId& txId, + ui64 taskId, + const THolderFactory& /*holderFactory*/, + NPq::NProto::TDqPqTopicSource&& sourceParams, + NPq::NProto::TDqReadTaskParams&& readParams, + const NActors::TActorId& computeActorId, + const NActors::TActorId& localRowDispatcherActorId, + const TString& token, + const ::NMonitoring::TDynamicCounterPtr& counters) + : TActor(&TDqPqRdReadActor::StateFunc) + , TDqPqReadActorBase(inputIndex, taskId, this->SelfId(), txId, std::move(sourceParams), std::move(readParams), computeActorId) + , Token(token) + , LocalRowDispatcherActorId(localRowDispatcherActorId) + , Metrics(txId, taskId, counters) +{ + MetadataFields.reserve(SourceParams.MetadataFieldsSize()); + TPqMetaExtractor fieldsExtractor; + for (const auto& fieldName : SourceParams.GetMetadataFields()) { + MetadataFields.emplace_back(fieldName, fieldsExtractor.FindExtractorLambda(fieldName)); + } + + IngressStats.Level = statsLevel; + SRC_LOG_D("Start read actor, local row dispatcher " << LocalRowDispatcherActorId.ToString()); +} + +void TDqPqRdReadActor::ProcessState() { + switch (State) { + case EState::INIT: + if (!ReadyBuffer.empty()) { + return; + } + if (!CoordinatorActorId) { + SRC_LOG_D("Send TEvCoordinatorChangesSubscribe to local row dispatcher, self id " << SelfId()); + Send(LocalRowDispatcherActorId, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe()); + if (!SchedulePrintStatePeriod) { + SchedulePrintStatePeriod = true; + Schedule(TDuration::Seconds(PrintStatePeriodSec), new TEvPrivate::TEvPrintState()); + } + } + State = 
EState::WAIT_COORDINATOR_ID; + [[fallthrough]]; + case EState::WAIT_COORDINATOR_ID: { + if (!CoordinatorActorId) { + return; + } + State = EState::WAIT_PARTITIONS_ADDRES; + auto partitionToRead = GetPartitionsToRead(); + SRC_LOG_D("Send TEvCoordinatorRequest to coordinator " << CoordinatorActorId->ToString() << ", partIds: " << JoinSeq(", ", partitionToRead)); + Send( + *CoordinatorActorId, + new NFq::TEvRowDispatcher::TEvCoordinatorRequest(SourceParams, partitionToRead), + IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, + ++CoordinatorRequestCookie); + return; + } + case EState::WAIT_PARTITIONS_ADDRES: + if (Sessions.empty()) { + return; + } + + for (auto& [partitionId, sessionInfo] : Sessions) { + if (sessionInfo.Status == SessionInfo::ESessionStatus::NoSession) { + TMaybe readOffset; + TPartitionKey partitionKey{TString{}, partitionId}; + const auto offsetIt = PartitionToOffset.find(partitionKey); + if (offsetIt != PartitionToOffset.end()) { + SRC_LOG_D("readOffset found" ); + readOffset = offsetIt->second; + } + + SRC_LOG_D("Send TEvStartSession to " << sessionInfo.RowDispatcherActorId + << ", offset " << readOffset + << ", partitionId " << partitionId); + + auto event = new NFq::TEvRowDispatcher::TEvStartSession( + SourceParams, + partitionId, + Token, + readOffset, + StartingMessageTimestamp.MilliSeconds(), + std::visit([](auto arg) { return ToString(arg); }, TxId)); + sessionInfo.EventsQueue.Send(event); + sessionInfo.IsWaitingRowDispatcherResponse = true; + sessionInfo.Status = SessionInfo::ESessionStatus::Started; + } + } + State = EState::STARTED; + return; + case EState::STARTED: + return; + } +} + + +void TDqPqRdReadActor::CommitState(const NDqProto::TCheckpoint& /*checkpoint*/) { +} + +void TDqPqRdReadActor::StopSessions() { + SRC_LOG_I("Stop all session"); + for (auto& [partitionId, sessionInfo] : Sessions) { + if (sessionInfo.Status == SessionInfo::ESessionStatus::NoSession) { + continue; + } + auto event = std::make_unique(); 
+ *event->Record.MutableSource() = SourceParams; + event->Record.SetPartitionId(partitionId); + SRC_LOG_D("Send StopSession to " << sessionInfo.RowDispatcherActorId); + sessionInfo.EventsQueue.Send(event.release()); + } +} + +// IActor & IDqComputeActorAsyncInput +void TDqPqRdReadActor::PassAway() { // Is called from Compute Actor + SRC_LOG_D("PassAway"); + PrintInternalState(); + StopSessions(); + TActor::PassAway(); + + // TODO: RetryQueue::Unsubscribe() +} + +i64 TDqPqRdReadActor::GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& buffer, TMaybe& /*watermark*/, bool&, i64 freeSpace) { + SRC_LOG_T("GetAsyncInputData freeSpace = " << freeSpace); + + ProcessState(); + if (ReadyBuffer.empty() || !freeSpace) { + return 0; + } + i64 usedSpace = 0; + buffer.clear(); + do { + auto& readyBatch = ReadyBuffer.front(); + SRC_LOG_T("Return " << readyBatch.Data.size() << " items"); + + for (const auto& message : readyBatch.Data) { + auto [item, size] = CreateItem(message); + buffer.push_back(std::move(item)); + } + usedSpace += readyBatch.UsedSpace; + freeSpace -= readyBatch.UsedSpace; + SRC_LOG_T("usedSpace " << usedSpace); + SRC_LOG_T("freeSpace " << freeSpace); + + TPartitionKey partitionKey{TString{}, readyBatch.PartitionId}; + PartitionToOffset[partitionKey] = readyBatch.NextOffset; + SRC_LOG_T("NextOffset " << readyBatch.NextOffset); + ReadyBuffer.pop(); + } while (freeSpace > 0 && !ReadyBuffer.empty()); + + ProcessState(); + return usedSpace; +} + +std::vector TDqPqRdReadActor::GetPartitionsToRead() const { + std::vector res; + + ui64 currentPartition = ReadParams.GetPartitioningParams().GetEachTopicPartitionGroupId(); + do { + res.emplace_back(currentPartition); // 0-based in topic API + currentPartition += ReadParams.GetPartitioningParams().GetDqPartitionsCount(); + } while (currentPartition < ReadParams.GetPartitioningParams().GetTopicPartitionsCount()); + return res; +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvStartSessionAck::TPtr& ev) { + 
const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + SRC_LOG_D("TEvStartSessionAck from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + + ui64 partitionId = ev->Get()->Record.GetConsumer().GetPartitionId(); + auto sessionIt = Sessions.find(partitionId); + YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + auto& sessionInfo = sessionIt->second; + if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { + SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); + return; + } +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + SRC_LOG_D("TEvSessionError from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + + ui64 partitionId = ev->Get()->Record.GetPartitionId(); + auto sessionIt = Sessions.find(partitionId); + YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + + auto& sessionInfo = sessionIt->second; + if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { + SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); + return; + } + Stop(ev->Get()->Record.GetMessage()); +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + SRC_LOG_D("TEvStatus from " << ev->Sender << ", offset " << ev->Get()->Record.GetNextMessageOffset() << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + + ui64 partitionId = ev->Get()->Record.GetPartitionId(); + auto sessionIt = Sessions.find(partitionId); + YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + auto& sessionInfo = sessionIt->second; + + if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { + SRC_LOG_W("Wrong seq num ignore message, seqNo " << 
meta.GetSeqNo()); + return; + } + + if (ReadyBuffer.empty()) { + TPartitionKey partitionKey{TString{}, partitionId}; + PartitionToOffset[partitionKey] = ev->Get()->Record.GetNextMessageOffset(); + } +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + SRC_LOG_T("TEvNewDataArrived from " << ev->Sender << ", part id " << ev->Get()->Record.GetPartitionId() << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + + ui64 partitionId = ev->Get()->Record.GetPartitionId(); + auto sessionIt = Sessions.find(partitionId); + if (sessionIt == Sessions.end()) { + Stop("Internal error: unknown partition id " + ToString(partitionId)); + return; + } + + auto& sessionInfo = sessionIt->second; + if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { + SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); + return; + } + sessionInfo.NewDataArrived = true; + Metrics.InFlyGetNextBatch->Inc(); + auto event = std::make_unique(); + event->Record.SetPartitionId(partitionId); + sessionInfo.EventsQueue.Send(event.release()); +} + +void TDqPqRdReadActor::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ev) { + SRC_LOG_D("TEvRetry"); + ui64 partitionId = ev->Get()->EventQueueId; + + auto sessionIt = Sessions.find(partitionId); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Unknown partition id " << partitionId << ", skip TEvRetry"); + return; + } + sessionIt->second.EventsQueue.Retry(); +} + +void TDqPqRdReadActor::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr& ev) { + SRC_LOG_T("TEvRetryQueuePrivate::TEvPing"); + ui64 partitionId = ev->Get()->EventQueueId; + + auto sessionIt = Sessions.find(partitionId); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Unknown partition id " << partitionId << ", skip TEvPing"); + return; + } + sessionIt->second.EventsQueue.Ping(); +} + +void 
TDqPqRdReadActor::Handle(const NActors::TEvents::TEvPing::TPtr& ev) { + SRC_LOG_T("NActors::TEvents::TEvPing"); + Send(ev->Sender, new NActors::TEvents::TEvPong()); +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) { + SRC_LOG_D("TEvCoordinatorChanged, new coordinator " << ev->Get()->CoordinatorActorId); + + if (CoordinatorActorId + && CoordinatorActorId == ev->Get()->CoordinatorActorId) { + return; + } + + if (!CoordinatorActorId) { + CoordinatorActorId = ev->Get()->CoordinatorActorId; + ProcessState(); + return; + } + + CoordinatorActorId = ev->Get()->CoordinatorActorId; + SRC_LOG_I("Coordinator is changed, reinit all sessions"); + ReInit(); + ProcessState(); +} + +void TDqPqRdReadActor::ReInit() { + SRC_LOG_I("ReInit state"); + StopSessions(); + Sessions.clear(); + State = EState::INIT; + if (!ReadyBuffer.empty()) { + Send(ComputeActorId, new TEvNewAsyncInputDataArrived(InputIndex)); + } + ProcessState(); +} + +void TDqPqRdReadActor::Stop(const TString& message) { + NYql::TIssues issues; + issues.AddIssue(NYql::TIssue{message}); + SRC_LOG_E("Stop read actor, error: " << message); + Send(ComputeActorId, new TEvAsyncInputError(InputIndex, issues, NYql::NDqProto::StatusIds::BAD_REQUEST)); // TODO: use UNAVAILABLE ? +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorResult::TPtr& ev) { + SRC_LOG_D("TEvCoordinatorResult from " << ev->Sender.ToString() << ", cookie " << ev->Cookie); + if (ev->Cookie != CoordinatorRequestCookie) { + SRC_LOG_W("Ignore TEvCoordinatorResult. 
wrong cookie"); + return; + } + for (auto& p : ev->Get()->Record.GetPartitions()) { + TActorId rowDispatcherActorId = ActorIdFromProto(p.GetActorId()); + SRC_LOG_D(" rowDispatcherActorId:" << rowDispatcherActorId); + + for (auto partitionId : p.GetPartitionId()) { + SRC_LOG_D(" partitionId:" << partitionId); + if (!Sessions.contains(partitionId)) { // TODO + Sessions.emplace( + std::piecewise_construct, + std::forward_as_tuple(partitionId), + std::forward_as_tuple(TxId, SelfId(), rowDispatcherActorId, partitionId)); + } + } + } + ProcessState(); +} + +void TDqPqRdReadActor::HandleConnected(TEvInterconnect::TEvNodeConnected::TPtr& ev) { + SRC_LOG_D("EvNodeConnected " << ev->Get()->NodeId); + for (auto& [partitionId, sessionInfo] : Sessions) { + sessionInfo.EventsQueue.HandleNodeConnected(ev->Get()->NodeId); + } +} + +void TDqPqRdReadActor::HandleDisconnected(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + SRC_LOG_D("TEvNodeDisconnected, node id " << ev->Get()->NodeId); + for (auto& [partitionId, sessionInfo] : Sessions) { + sessionInfo.EventsQueue.HandleNodeDisconnected(ev->Get()->NodeId); + } + // In case of row dispatcher disconnection: wait connected or SessionClosed(). TODO: Stop actor after timeout. + // In case of row dispatcher disconnection: wait CoordinatorChanged(). 
+ //Stop(TString{"Node disconnected, nodeId "} + ToString(ev->Get()->NodeId)); +} + +void TDqPqRdReadActor::Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { + SRC_LOG_D("TEvUndelivered, " << ev->Get()->ToString() << " from " << ev->Sender.ToString()); + for (auto& [partitionId, sessionInfo] : Sessions) { + sessionInfo.EventsQueue.HandleUndelivered(ev); + } + + if (CoordinatorActorId && *CoordinatorActorId == ev->Sender) { + SRC_LOG_D("TEvUndelivered to coordinator, reinit"); + ReInit(); + } +} + +void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) { + const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); + SRC_LOG_T("TEvMessageBatch from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); + ui64 partitionId = ev->Get()->Record.GetPartitionId(); + YQL_ENSURE(Sessions.count(partitionId), "Unknown partition id"); + auto it = Sessions.find(partitionId); + if (it == Sessions.end()) { + Stop("Wrong session data"); + return; + } + + Metrics.InFlyGetNextBatch->Dec(); + auto& sessionInfo = it->second; + if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { + SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); + return; + } + ReadyBuffer.emplace(partitionId, ev->Get()->Record.MessagesSize()); + TReadyBatch& activeBatch = ReadyBuffer.back(); + + ui64 bytes = 0; + for (const auto& message : ev->Get()->Record.GetMessages()) { + SRC_LOG_T("Json: " << message.GetJson()); + activeBatch.Data.emplace_back(message.GetJson()); + activeBatch.UsedSpace += message.GetJson().size(); + sessionInfo.NextOffset = message.GetOffset() + 1; + bytes += message.GetJson().size(); + SRC_LOG_T("TEvMessageBatch NextOffset " << sessionInfo.NextOffset); + } + IngressStats.Bytes += bytes; + IngressStats.Chunks++; + activeBatch.NextOffset = ev->Get()->Record.GetNextMessageOffset(); + Send(ComputeActorId, new TEvNewAsyncInputDataArrived(InputIndex)); +} + +std::pair 
TDqPqRdReadActor::CreateItem(const TString& data) { + i64 usedSpace = 0; + NUdf::TUnboxedValuePod item; + item = NKikimr::NMiniKQL::MakeString(NUdf::TStringRef(data.Data(), data.Size())); + usedSpace += data.Size(); + return std::make_pair(item, usedSpace); +} + +void TDqPqRdReadActor::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr& ev) { + SRC_LOG_D("Session closed, event queue id " << ev->Get()->EventQueueId); + ReInit(); +} + +void TDqPqRdReadActor::Handle(NActors::TEvents::TEvPong::TPtr& ev) { + SRC_LOG_T("TEvPong from " << ev->Sender); +} + +void TDqPqRdReadActor::Handle(TEvPrivate::TEvPrintState::TPtr&) { + Schedule(TDuration::Seconds(PrintStatePeriodSec), new TEvPrivate::TEvPrintState()); + PrintInternalState(); +} + +void TDqPqRdReadActor::PrintInternalState() { + TStringStream str; + str << "State:\n"; + for (auto& [partitionId, sessionInfo] : Sessions) { + str << " partId " << partitionId << " "; + sessionInfo.EventsQueue.PrintInternalState(str); + } + SRC_LOG_D(str.Str()); +} + +std::pair CreateDqPqRdReadActor( + NPq::NProto::TDqPqTopicSource&& settings, + ui64 inputIndex, + TCollectStatsLevel statsLevel, + TTxId txId, + ui64 taskId, + const THashMap& secureParams, + const THashMap& taskParams, + const NActors::TActorId& computeActorId, + const NActors::TActorId& localRowDispatcherActorId, + const NKikimr::NMiniKQL::THolderFactory& holderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters, + i64 /*bufferSize*/) // TODO +{ + auto taskParamsIt = taskParams.find("pq"); + YQL_ENSURE(taskParamsIt != taskParams.end(), "Failed to get pq task params"); + + NPq::NProto::TDqReadTaskParams readTaskParamsMsg; + YQL_ENSURE(readTaskParamsMsg.ParseFromString(taskParamsIt->second), "Failed to parse DqPqRead task params"); + + const TString& tokenName = settings.GetToken().GetName(); + const TString token = secureParams.Value(tokenName, TString()); + + TDqPqRdReadActor* actor = new TDqPqRdReadActor( + inputIndex, + statsLevel, + txId, + 
taskId, + holderFactory, + std::move(settings), + std::move(readTaskParamsMsg), + computeActorId, + localRowDispatcherActorId, + token, + counters + ); + + return {actor, actor}; +} + +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h new file mode 100644 index 000000000000..d1131fd7a76e --- /dev/null +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +#include +#include +#include + +namespace NYql::NDq { +class TDqAsyncIoFactory; + +const i64 PQRdReadDefaultFreeSpace = 16_MB; + +std::pair CreateDqPqRdReadActor( + NPq::NProto::TDqPqTopicSource&& settings, + ui64 inputIndex, + TCollectStatsLevel statsLevel, + TTxId txId, + ui64 taskId, + const THashMap& secureParams, + const THashMap& taskParams, + const NActors::TActorId& computeActorId, + const NActors::TActorId& localRowDispatcherActorId, + const NKikimr::NMiniKQL::THolderFactory& holderFactory, + const ::NMonitoring::TDynamicCounterPtr& counters, + i64 bufferSize = PQRdReadDefaultFreeSpace); + +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp index 112187cd72ba..b57f80c8478c 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -28,11 +30,14 @@ #include #include +#include + #include #include #include #include + #include #include @@ -59,8 +64,6 @@ using namespace NActors; using namespace NLog; using namespace NKikimr::NMiniKQL; -constexpr ui32 StateVersion = 1; - namespace { LWTRACE_USING(DQ_PQ_PROVIDER); @@ -84,7 +87,7 @@ struct TEvPrivate { } // namespace -class TDqPqReadActor : public 
NActors::TActor, public IDqComputeActorAsyncInput { +class TDqPqReadActor : public NActors::TActor, public NYql::NDq::NInternal::TDqPqReadActorBase { struct TMetrics { TMetrics(const TTxId& txId, ui64 taskId, const ::NMonitoring::TDynamicCounterPtr& counters) : TxId(std::visit([](auto arg) { return ToString(arg); }, txId)) @@ -127,18 +130,12 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute const ::NMonitoring::TDynamicCounterPtr& counters, i64 bufferSize) : TActor(&TDqPqReadActor::StateFunc) - , InputIndex(inputIndex) - , TxId(txId) + , TDqPqReadActorBase(inputIndex, taskId, this->SelfId(), txId, std::move(sourceParams), std::move(readParams), computeActorId) , Metrics(txId, taskId, counters) , BufferSize(bufferSize) , HolderFactory(holderFactory) - , LogPrefix(TStringBuilder() << "SelfId: " << this->SelfId() << ", TxId: " << TxId << ", task: " << taskId << ". PQ source. ") , Driver(std::move(driver)) , CredentialsProviderFactory(std::move(credentialsProviderFactory)) - , SourceParams(std::move(sourceParams)) - , ReadParams(std::move(readParams)) - , StartingMessageTimestamp(TInstant::MilliSeconds(TInstant::Now().MilliSeconds())) // this field is serialized as milliseconds, so drop microseconds part to be consistent with storage - , ComputeActorId(computeActorId) { MetadataFields.reserve(SourceParams.MetadataFieldsSize()); TPqMetaExtractor fieldsExtractor; @@ -164,64 +161,13 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute public: void SaveState(const NDqProto::TCheckpoint& checkpoint, TSourceState& state) override { - NPq::NProto::TDqPqTopicSourceState stateProto; - - NPq::NProto::TDqPqTopicSourceState::TTopicDescription* topic = stateProto.AddTopics(); - topic->SetDatabaseId(SourceParams.GetDatabaseId()); - topic->SetEndpoint(SourceParams.GetEndpoint()); - topic->SetDatabase(SourceParams.GetDatabase()); - topic->SetTopicPath(SourceParams.GetTopicPath()); - - for (const auto& [clusterAndPartition, offset] : PartitionToOffset) 
{ - const auto& [cluster, partition] = clusterAndPartition; - NPq::NProto::TDqPqTopicSourceState::TPartitionReadState* partitionState = stateProto.AddPartitions(); - partitionState->SetTopicIndex(0); // Now we are supporting only one topic per source. - partitionState->SetCluster(cluster); - partitionState->SetPartition(partition); - partitionState->SetOffset(offset); - } - - stateProto.SetStartingMessageTimestampMs(StartingMessageTimestamp.MilliSeconds()); - stateProto.SetIngressBytes(IngressStats.Bytes); - - TString stateBlob; - YQL_ENSURE(stateProto.SerializeToString(&stateBlob)); - - state.Data.emplace_back(stateBlob, StateVersion); - + TDqPqReadActorBase::SaveState(checkpoint, state); DeferredCommits.emplace(checkpoint.GetId(), std::move(CurrentDeferredCommit)); CurrentDeferredCommit = NYdb::NTopic::TDeferredCommit(); } void LoadState(const TSourceState& state) override { - TInstant minStartingMessageTs = state.DataSize() ? TInstant::Max() : StartingMessageTimestamp; - ui64 ingressBytes = 0; - for (const auto& data : state.Data) { - if (data.Version == StateVersion) { // Current version - NPq::NProto::TDqPqTopicSourceState stateProto; - YQL_ENSURE(stateProto.ParseFromString(data.Blob), "Serialized state is corrupted"); - YQL_ENSURE(stateProto.TopicsSize() == 1, "One topic per source is expected"); - PartitionToOffset.reserve(PartitionToOffset.size() + stateProto.PartitionsSize()); - for (const NPq::NProto::TDqPqTopicSourceState::TPartitionReadState& partitionProto : stateProto.GetPartitions()) { - ui64& offset = PartitionToOffset[TPartitionKey{partitionProto.GetCluster(), partitionProto.GetPartition()}]; - if (offset) { - offset = Min(offset, partitionProto.GetOffset()); - } else { - offset = partitionProto.GetOffset(); - } - } - minStartingMessageTs = Min(minStartingMessageTs, TInstant::MilliSeconds(stateProto.GetStartingMessageTimestampMs())); - ingressBytes += stateProto.GetIngressBytes(); - } else { - ythrow yexception() << "Invalid state version " << 
data.Version; - } - } - for (const auto& [key, value] : PartitionToOffset) { - SRC_LOG_D("SessionId: " << GetSessionId() << " Restoring offset: cluster " << key.first << ", partition id " << key.second << ", offset: " << value); - } - StartingMessageTimestamp = minStartingMessageTs; - IngressStats.Bytes += ingressBytes; - IngressStats.Chunks++; + TDqPqReadActorBase::LoadState(state); InitWatermarkTracker(); if (ReadSession) { @@ -239,14 +185,6 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute } } - ui64 GetInputIndex() const override { - return InputIndex; - } - - const TDqAsyncStats& GetIngressStats() const override { - return IngressStats; - } - NYdb::NTopic::TTopicClient& GetTopicClient() { if (!TopicClient) { TopicClient = std::make_unique(Driver, GetTopicClientSettings()); @@ -262,7 +200,7 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute return *ReadSession; } - TString GetSessionId() const { + TString GetSessionId() const override { return ReadSession ? ReadSession->GetSessionId() : TString{"empty"}; } @@ -625,23 +563,14 @@ class TDqPqReadActor : public NActors::TActor, public IDqCompute }; private: - const ui64 InputIndex; - TDqAsyncStats IngressStats; - const TTxId TxId; TMetrics Metrics; const i64 BufferSize; const THolderFactory& HolderFactory; - const TString LogPrefix; NYdb::TDriver Driver; std::shared_ptr CredentialsProviderFactory; - const NPq::NProto::TDqPqTopicSource SourceParams; - const NPq::NProto::TDqReadTaskParams ReadParams; std::unique_ptr TopicClient; std::shared_ptr ReadSession; NThreading::TFuture EventFuture; - THashMap PartitionToOffset; // {cluster, partition} -> offset of next event. 
- TInstant StartingMessageTimestamp; - const NActors::TActorId ComputeActorId; std::queue> DeferredCommits; NYdb::NTopic::TDeferredCommit CurrentDeferredCommit; bool SubscribedOnEvent = false; @@ -702,7 +631,25 @@ void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driv IDqAsyncIoFactory::TSourceArguments&& args) { NLwTraceMonPage::ProbeRegistry().AddProbesList(LWTRACE_GET_PROBES(DQ_PQ_PROVIDER)); - return CreateDqPqReadActor( + + if (!settings.GetSharedReading()) { + return CreateDqPqReadActor( + std::move(settings), + args.InputIndex, + args.StatsLevel, + args.TxId, + args.TaskId, + args.SecureParams, + args.TaskParams, + driver, + credentialsFactory, + args.ComputeActorId, + args.HolderFactory, + counters, + PQReadDefaultFreeSpace); + } + + return CreateDqPqRdReadActor( std::move(settings), args.InputIndex, args.StatsLevel, @@ -710,9 +657,8 @@ void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driv args.TaskId, args.SecureParams, args.TaskParams, - driver, - credentialsFactory, args.ComputeActorId, + NFq::RowDispatcherServiceActorId(), args.HolderFactory, counters, PQReadDefaultFreeSpace); diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.cpp new file mode 100644 index 000000000000..c1bc837f4cba --- /dev/null +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.cpp @@ -0,0 +1,86 @@ +#include "dq_pq_read_actor.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +using namespace NYql::NDq::NInternal; + +constexpr ui32 StateVersion = 1; + +#define SRC_LOG_D(s) \ + LOG_DEBUG_S(*NActors::TlsActivationContext, NKikimrServices::KQP_COMPUTE, LogPrefix << s) + +void TDqPqReadActorBase::SaveState(const NDqProto::TCheckpoint& /*checkpoint*/, TSourceState& state) { + NPq::NProto::TDqPqTopicSourceState stateProto; + + 
NPq::NProto::TDqPqTopicSourceState::TTopicDescription* topic = stateProto.AddTopics(); + topic->SetDatabaseId(SourceParams.GetDatabaseId()); + topic->SetEndpoint(SourceParams.GetEndpoint()); + topic->SetDatabase(SourceParams.GetDatabase()); + topic->SetTopicPath(SourceParams.GetTopicPath()); + + for (const auto& [clusterAndPartition, offset] : PartitionToOffset) { + const auto& [cluster, partition] = clusterAndPartition; + NPq::NProto::TDqPqTopicSourceState::TPartitionReadState* partitionState = stateProto.AddPartitions(); + partitionState->SetTopicIndex(0); // Now we are supporting only one topic per source. + partitionState->SetCluster(cluster); + partitionState->SetPartition(partition); + partitionState->SetOffset(offset); + } + + stateProto.SetStartingMessageTimestampMs(StartingMessageTimestamp.MilliSeconds()); + stateProto.SetIngressBytes(IngressStats.Bytes); + + TString stateBlob; + YQL_ENSURE(stateProto.SerializeToString(&stateBlob)); + + state.Data.emplace_back(stateBlob, StateVersion); +} + +void TDqPqReadActorBase::LoadState(const TSourceState& state) { + TInstant minStartingMessageTs = state.DataSize() ? 
TInstant::Max() : StartingMessageTimestamp; + ui64 ingressBytes = 0; + for (const auto& data : state.Data) { + if (data.Version != StateVersion) { + ythrow yexception() << "Invalid state version, expected " << StateVersion << ", actual " << data.Version; + } + NPq::NProto::TDqPqTopicSourceState stateProto; + YQL_ENSURE(stateProto.ParseFromString(data.Blob), "Serialized state is corrupted"); + YQL_ENSURE(stateProto.TopicsSize() == 1, "One topic per source is expected"); + PartitionToOffset.reserve(PartitionToOffset.size() + stateProto.PartitionsSize()); + for (const NPq::NProto::TDqPqTopicSourceState::TPartitionReadState& partitionProto : stateProto.GetPartitions()) { + ui64& offset = PartitionToOffset[TPartitionKey{partitionProto.GetCluster(), partitionProto.GetPartition()}]; + if (offset) { + offset = Min(offset, partitionProto.GetOffset()); + } else { + offset = partitionProto.GetOffset(); + } + } + minStartingMessageTs = Min(minStartingMessageTs, TInstant::MilliSeconds(stateProto.GetStartingMessageTimestampMs())); + ingressBytes += stateProto.GetIngressBytes(); + } + for (const auto& [key, value] : PartitionToOffset) { + SRC_LOG_D("SessionId: " << GetSessionId() << " Restoring offset: cluster " << key.first << ", partition id " << key.second << ", offset: " << value); + } + StartingMessageTimestamp = minStartingMessageTs; + IngressStats.Bytes += ingressBytes; + IngressStats.Chunks++; +} + +ui64 TDqPqReadActorBase::GetInputIndex() const { + return InputIndex; +} + +const NYql::NDq::TDqAsyncStats& TDqPqReadActorBase::GetIngressStats() const { + return IngressStats; +} \ No newline at end of file diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.h b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.h new file mode 100644 index 000000000000..f00176b8ab96 --- /dev/null +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor_base.h @@ -0,0 +1,51 @@ +#pragma once + +#include + +namespace NYql::NDq::NInternal { + +class 
TDqPqReadActorBase : public IDqComputeActorAsyncInput { + +public: + using TPartitionKey = std::pair; // Cluster, partition id. + + const ui64 InputIndex; + THashMap PartitionToOffset; // {cluster, partition} -> offset of next event. + const TTxId TxId; + const NPq::NProto::TDqPqTopicSource SourceParams; + TDqAsyncStats IngressStats; + TInstant StartingMessageTimestamp; + const TString LogPrefix; + const NPq::NProto::TDqReadTaskParams ReadParams; + const NActors::TActorId ComputeActorId; + + TDqPqReadActorBase( + ui64 inputIndex, + ui64 taskId, + NActors::TActorId selfId, + const TTxId& txId, + NPq::NProto::TDqPqTopicSource&& sourceParams, + NPq::NProto::TDqReadTaskParams&& readParams, + const NActors::TActorId& computeActorId) + : InputIndex(inputIndex) + , TxId(txId) + , SourceParams(std::move(sourceParams)) + , StartingMessageTimestamp(TInstant::MilliSeconds(TInstant::Now().MilliSeconds())) // this field is serialized as milliseconds, so drop microseconds part to be consistent with storage + , LogPrefix(TStringBuilder() << "SelfId: " << selfId << ", TxId: " << txId << ", task: " << taskId << ". PQ source. 
") + , ReadParams(std::move(readParams)) + , ComputeActorId(computeActorId) { + } + +public: + void SaveState(const NDqProto::TCheckpoint& checkpoint, TSourceState& state) override; + void LoadState(const TSourceState& state) override; + + ui64 GetInputIndex() const override; + const TDqAsyncStats& GetIngressStats() const override; + + virtual TString GetSessionId() const { + return TString{"empty"}; + } +}; + +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/pq/async_io/ya.make b/ydb/library/yql/providers/pq/async_io/ya.make index 46fec067e4ab..5f35c84043ad 100644 --- a/ydb/library/yql/providers/pq/async_io/ya.make +++ b/ydb/library/yql/providers/pq/async_io/ya.make @@ -2,16 +2,22 @@ LIBRARY() SRCS( dq_pq_meta_extractor.cpp + dq_pq_rd_read_actor.cpp dq_pq_read_actor.cpp + dq_pq_read_actor_base.cpp dq_pq_write_actor.cpp probes.cpp ) PEERDIR( + ydb/core/fq/libs/graph_params/proto + ydb/core/fq/libs/protos + ydb/core/fq/libs/row_dispatcher ydb/library/actors/log_backend ydb/library/yql/dq/actors/compute ydb/library/yql/minikql/computation ydb/library/yql/providers/common/token_accessor/client + ydb/library/yql/providers/dq/api/protos ydb/library/yql/providers/pq/common ydb/library/yql/providers/pq/proto ydb/library/yql/public/types diff --git a/ydb/library/yql/providers/pq/common/yql_names.h b/ydb/library/yql/providers/pq/common/yql_names.h index 268944e9bbe8..c4f6eeb3cc42 100644 --- a/ydb/library/yql/providers/pq/common/yql_names.h +++ b/ydb/library/yql/providers/pq/common/yql_names.h @@ -7,6 +7,8 @@ namespace NYql { constexpr TStringBuf PartitionsCountProp = "PartitionsCount"; constexpr TStringBuf ConsumerSetting = "Consumer"; constexpr TStringBuf EndpointSetting = "Endpoint"; +constexpr TStringBuf SharedReading = "SharedReading"; +constexpr TStringBuf Format = "Format"; constexpr TStringBuf UseSslSetting = "UseSsl"; constexpr TStringBuf AddBearerToTokenSetting = "AddBearerToToken"; constexpr TStringBuf WatermarksEnableSetting = "WatermarksEnable"; 
diff --git a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json index e43a69ba1753..8a8f172d307f 100644 --- a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json +++ b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json @@ -70,7 +70,9 @@ {"Index": 0, "Name": "Topic", "Type": "TPqTopic"}, {"Index": 1, "Name": "Columns", "Type": "TExprBase"}, {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList"}, - {"Index": 3, "Name": "Token", "Type": "TCoSecureParam"} + {"Index": 3, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 4, "Name": "FilterPredicate", "Type": "TCoLambda"}, + {"Index": 5, "Name": "ColumnTypes", "Type": "TExprBase"} ] }, { diff --git a/ydb/library/yql/providers/pq/proto/dq_io.proto b/ydb/library/yql/providers/pq/proto/dq_io.proto index 093420888516..1f9a17b71782 100644 --- a/ydb/library/yql/providers/pq/proto/dq_io.proto +++ b/ydb/library/yql/providers/pq/proto/dq_io.proto @@ -33,6 +33,10 @@ message TDqPqTopicSource { string DatabaseId = 9; repeated string MetadataFields = 10; TWatermarks Watermarks = 11; + repeated string Columns = 12; + repeated string ColumnTypes = 13; + string Predicate = 14; + bool SharedReading = 15; } message TDqPqTopicSink { diff --git a/ydb/library/yql/providers/pq/provider/ya.make b/ydb/library/yql/providers/pq/provider/ya.make index e51b9ebd82ff..be8405e07576 100644 --- a/ydb/library/yql/providers/pq/provider/ya.make +++ b/ydb/library/yql/providers/pq/provider/ya.make @@ -24,29 +24,31 @@ PEERDIR( library/cpp/random_provider library/cpp/time_provider ydb/library/yql/ast - ydb/library/yql/minikql/comp_nodes - ydb/library/yql/providers/common/db_id_async_resolver - ydb/library/yql/providers/common/structured_token - ydb/library/yql/public/udf - ydb/public/sdk/cpp/client/ydb_driver ydb/library/yql/core ydb/library/yql/core/type_ann ydb/library/yql/dq/expr_nodes ydb/library/yql/dq/integration ydb/library/yql/dq/opt + 
ydb/library/yql/minikql/comp_nodes ydb/library/yql/providers/common/config + ydb/library/yql/providers/common/db_id_async_resolver ydb/library/yql/providers/common/dq ydb/library/yql/providers/common/proto ydb/library/yql/providers/common/provider + ydb/library/yql/providers/common/pushdown + ydb/library/yql/providers/common/structured_token ydb/library/yql/providers/common/transform ydb/library/yql/providers/dq/common ydb/library/yql/providers/dq/expr_nodes ydb/library/yql/providers/dq/provider/exec + ydb/library/yql/providers/generic/provider ydb/library/yql/providers/pq/cm_client ydb/library/yql/providers/pq/common ydb/library/yql/providers/pq/expr_nodes ydb/library/yql/providers/pq/proto ydb/library/yql/providers/result/expr_nodes + ydb/library/yql/public/udf + ydb/public/sdk/cpp/client/ydb_driver ) YQL_LAST_ABI_VERSION() diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp index 4b2b54c7b008..cd171c8dd446 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -131,7 +132,7 @@ class TPqDataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { } TStatus HandleDqTopicSource(TExprBase input, TExprContext& ctx) { - if (!EnsureArgsCount(input.Ref(), 4, ctx)) { + if (!EnsureArgsCount(input.Ref(), 6, ctx)) { return TStatus::Error; } @@ -150,6 +151,13 @@ class TPqDataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { return TStatus::Error; } + auto rowSchema = topic.RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); + + const TStatus filterAnnotationStatus = NYql::NPushdown::AnnotateFilterPredicate(input.Ptr(), TDqPqTopicSource::idx_FilterPredicate, rowSchema, ctx); + if (filterAnnotationStatus != TStatus::Ok) { + return filterAnnotationStatus; + } + if 
(topic.Metadata().Empty()) { input.Ptr()->SetTypeAnn(ctx.MakeType(ctx.MakeType(EDataSlot::String))); return TStatus::Ok; diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp index f98c58d173d6..bac0ba92fbc7 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -85,6 +87,8 @@ class TPqDqIntegration: public TDqIntegrationBase { .Value(pqReadTopic.Format()) .Done()); + auto format = pqReadTopic.Format().Ref().Content(); + TVector innerSettings; if (pqReadTopic.Compression() != "") { innerSettings.push_back(Build(ctx, pqReadTopic.Pos()) @@ -119,24 +123,47 @@ class TPqDqIntegration: public TDqIntegrationBase { .Done()); const auto token = "cluster:default_" + clusterName; - auto columns = pqReadTopic.Columns().Ptr(); - if (!columns->IsList()) { - const auto pos = columns->Pos(); - const auto& items = rowType->GetItems(); - TExprNode::TListType cols; - cols.reserve(items.size()); - std::transform(items.cbegin(), items.cend(), std::back_inserter(cols), [&](const TItemExprType* item) { return ctx.NewAtom(pos, item->GetName()); }); - columns = ctx.NewList(pos, std::move(cols)); - } + + auto rowSchema = pqReadTopic.Topic().RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); + TExprNode::TListType colTypes; + const auto& typeItems = rowSchema->GetItems(); + colTypes.reserve(typeItems.size()); + const auto pos = read->Pos(); // TODO + std::transform(typeItems.cbegin(), typeItems.cend(), std::back_inserter(colTypes), + [&](const TItemExprType* item) { + return ctx.NewAtom(pos, FormatType(item->GetItemType())); + }); + auto columnTypes = ctx.NewList(pos, std::move(colTypes)); + + TExprNode::TListType colNames; + colNames.reserve(typeItems.size()); + std::transform(typeItems.cbegin(), typeItems.cend(), 
std::back_inserter(colNames), + [&](const TItemExprType* item) { + return ctx.NewAtom(pos, item->GetName()); + }); + auto columnNames = ctx.NewList(pos, std::move(colNames)); + + auto row = Build(ctx, read->Pos()) + .Name("row") + .Done(); + auto emptyPredicate = Build(ctx, read->Pos()) + .Args({row}) + .Body() + .Literal().Build("true") + .Build() + .Done().Ptr(); + return Build(ctx, read->Pos()) .Input() .Topic(pqReadTopic.Topic()) - .Columns(std::move(columns)) - .Settings(BuildTopicReadSettings(clusterName, dqSettings, read->Pos(), ctx)) + .Columns(std::move(columnNames)) + .Settings(BuildTopicReadSettings(clusterName, dqSettings, read->Pos(), format, ctx)) .Token() .Name().Build(token) .Build() + .FilterPredicate(emptyPredicate) + .ColumnTypes(std::move(columnTypes)) .Build() .RowType(ExpandType(pqReadTopic.Pos(), *rowType, ctx)) .DataSource(pqReadTopic.DataSource().Cast()) @@ -179,7 +206,7 @@ class TPqDqIntegration: public TDqIntegrationBase { } } - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t) override { + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t, TExprContext& ctx) override { if (auto maybeDqSource = TMaybeNode(&node)) { auto settings = maybeDqSource.Cast().Settings(); if (auto maybeTopicSource = TMaybeNode(settings.Raw())) { @@ -195,6 +222,8 @@ class TPqDqIntegration: public TDqIntegrationBase { srcDesc.SetClusterType(ToClusterType(clusterDesc->ClusterType)); srcDesc.SetDatabaseId(clusterDesc->DatabaseId); + bool sharedReading = false; + TString format; size_t const settingsCount = topicSource.Settings().Size(); for (size_t i = 0; i < settingsCount; ++i) { TCoNameValueTuple setting = topicSource.Settings().Item(i); @@ -203,6 +232,10 @@ class TPqDqIntegration: public TDqIntegrationBase { srcDesc.SetConsumerName(TString(Value(setting))); } else if (name == EndpointSetting) { 
srcDesc.SetEndpoint(TString(Value(setting))); + } else if (name == SharedReading) { + sharedReading = FromString(Value(setting)); + } else if (name == Format) { + format = TString(Value(setting)); } else if (name == UseSslSetting) { srcDesc.SetUseSsl(FromString(Value(setting))); } else if (name == AddBearerToTokenSetting) { @@ -230,7 +263,33 @@ class TPqDqIntegration: public TDqIntegrationBase { srcDesc.AddMetadataFields(metadata.Value().Maybe().Cast().StringValue()); } + for (const auto& column : topicSource.Columns().Cast()) { + srcDesc.AddColumns(column.StringValue()); + } + + for (const auto& columnTypes : topicSource.ColumnTypes().Cast()) { + srcDesc.AddColumnTypes(columnTypes.StringValue()); + } + + NYql::NConnector::NApi::TPredicate predicateProto; + if (auto predicate = topicSource.FilterPredicate(); !NYql::IsEmptyFilterPredicate(predicate)) { + TStringBuilder err; + if (!NYql::SerializeFilterPredicate(predicate, &predicateProto, err)) { + ythrow yexception() << "Failed to serialize filter predicate for source: " << err; + } + } + + //sharedReading = true; + sharedReading = sharedReading && (format == "json_each_row"); + TString predicateSql = NYql::FormatWhere(predicateProto); + if (sharedReading) { + srcDesc.SetPredicate(predicateSql); + srcDesc.SetSharedReading(true); + } protoSettings.PackFrom(srcDesc); + if (sharedReading && !predicateSql.empty()) { + ctx.AddWarning(TIssue(ctx.GetPosition(node.Pos()), "Row dispatcher will use the predicate: " + predicateSql)); + } sourceType = "PqSource"; } } @@ -278,6 +337,7 @@ class TPqDqIntegration: public TDqIntegrationBase { const TString& cluster, const TDqSettings& dqSettings, TPositionHandle pos, + std::string_view format, TExprContext& ctx) const { TVector props; @@ -295,6 +355,10 @@ class TPqDqIntegration: public TDqIntegrationBase { } Add(props, EndpointSetting, clusterConfiguration->Endpoint, pos, ctx); + Add(props, SharedReading, ToString(clusterConfiguration->SharedReading), pos, ctx); + Add(props, 
Format, format, pos, ctx); + + if (clusterConfiguration->UseSsl) { Add(props, UseSslSetting, "1", pos, ctx); } diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp index 8650ae9b2f8b..92964948185a 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp @@ -10,13 +10,25 @@ #include #include #include +#include +#include +#include +#include namespace NYql { using namespace NNodes; namespace { + struct TPushdownSettings: public NPushdown::TSettings { + TPushdownSettings() + : NPushdown::TSettings(NLog::EComponent::ProviderGeneric) + { + using EFlag = NPushdown::TSettings::EFeatureFlag; + Enable(EFlag::ExpressionAsPredicate | EFlag::ArithmeticalExpressions | EFlag::ImplicitConversionToInt64 | EFlag::StringTypes | EFlag::LikeOperator); + } + }; std::unordered_set GetUsedMetadataFields(const TCoExtractMembers& extract) { std::unordered_set usedMetadataFields; @@ -123,6 +135,7 @@ class TPqLogicalOptProposalTransformer : public TOptimizeTransformerBase { #define HNDL(name) "LogicalOptimizer-"#name, Hndl(&TPqLogicalOptProposalTransformer::name) // AddHandler(0, &TCoExtractMembers::Match, HNDL(ExtractMembers)); AddHandler(0, &TCoExtractMembers::Match, HNDL(ExtractMembersOverDqWrap)); + AddHandler(0, &TCoFlatMap::Match, HNDL(PushFilterToPqTopicSource)); #undef HNDL } @@ -200,6 +213,71 @@ class TPqLogicalOptProposalTransformer : public TOptimizeTransformerBase { .Input(ctx.ReplaceNode(input.Ptr(), dqSourceWrap.Ref(), newDqSourceWrap)) .Done(); } + + bool IsEmptyFilterPredicate(const TCoLambda& lambda) const { + auto maybeBool = lambda.Body().Maybe(); + if (!maybeBool) { + return false; + } + return TStringBuf(maybeBool.Cast().Literal()) == "true"sv; + } + + TMaybeNode PushFilterToPqTopicSource(TExprBase node, TExprContext& ctx) const { + auto flatmap = node.Cast(); + auto maybeExtractMembers = 
flatmap.Input().Maybe(); + + auto maybeDqSourceWrap = + maybeExtractMembers + ? maybeExtractMembers.Cast().Input().Maybe() + : flatmap.Input().Maybe(); + ; + if (!maybeDqSourceWrap) { + return node; + } + TDqSourceWrap dqSourceWrap = maybeDqSourceWrap.Cast(); + auto maybeDqPqTopicSource = dqSourceWrap.Input().Maybe(); + if (!maybeDqPqTopicSource) { + return node; + } + TDqPqTopicSource dqPqTopicSource = maybeDqPqTopicSource.Cast(); + if (!IsEmptyFilterPredicate(dqPqTopicSource.FilterPredicate())) { + YQL_CLOG(TRACE, ProviderPq) << "Push filter. Lambda is already not empty"; + return node; + } + + auto newFilterLambda = MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos(), TPushdownSettings()); + if (!newFilterLambda) { + ctx.AddWarning(TIssue(ctx.GetPosition(node.Pos()), "No predicate to pushdown")); + return node; + } + YQL_CLOG(INFO, ProviderPq) << "Build new TCoFlatMap with predicate"; + + if (maybeExtractMembers) { + return Build(ctx, flatmap.Pos()) + .InitFrom(flatmap) + .Input() + .InitFrom(maybeExtractMembers.Cast()) + .Input() + .InitFrom(dqSourceWrap) + .Input() + .InitFrom(dqPqTopicSource) + .FilterPredicate(newFilterLambda.Cast()) + .Build() + .Build() + .Build() + .Done(); + } + return Build(ctx, flatmap.Pos()) + .InitFrom(flatmap) + .Input() + .InitFrom(dqSourceWrap) + .Input() + .InitFrom(dqPqTopicSource) + .FilterPredicate(newFilterLambda.Cast()) + .Build() + .Build() + .Done(); + } private: TPqState::TPtr State_; diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_settings.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_settings.cpp index c424fa9d0e9f..5b97002b9ad1 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_settings.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_settings.cpp @@ -42,6 +42,7 @@ void TPqConfiguration::Init( clusterSettings.TvmId = cluster.GetTvmId(); clusterSettings.UseSsl = cluster.GetUseSsl(); clusterSettings.AddBearerToToken = cluster.GetAddBearerToToken(); + clusterSettings.SharedReading = 
cluster.GetSharedReading(); const TString authToken = typeCtx->Credentials->FindCredentialContent("cluster:default_" + clusterSettings.ClusterName, "default_pq", cluster.GetToken()); clusterSettings.AuthToken = authToken; diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_settings.h b/ydb/library/yql/providers/pq/provider/yql_pq_settings.h index a720506ef122..672effd42fc8 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_settings.h +++ b/ydb/library/yql/providers/pq/provider/yql_pq_settings.h @@ -29,6 +29,7 @@ struct TPqClusterConfigurationSettings { ui32 TvmId = 0; TString AuthToken; bool AddBearerToToken = false; + bool SharedReading = false; }; struct TPqConfiguration : public TPqSettings, public NCommon::TSettingDispatcher { diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp index 50fdf1894ef8..eca951e7405c 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_raw_read_actor.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp index f07a7951f6dc..795b8b950a63 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_read_actor.cpp @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp index 918953ad5b8d..d498ea244e8b 100644 --- a/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp +++ b/ydb/library/yql/providers/s3/actors/yql_s3_source_queue.cpp @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp 
b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index 3d5d7adc3179..d377e84ae8ca 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -358,7 +357,7 @@ class TS3DqIntegration: public TDqIntegrationBase { return read; } - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t maxPartitions) override { + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t maxPartitions, TExprContext&) override { const TDqSource source(&node); if (const auto maySettings = source.Settings().Maybe()) { const auto settings = maySettings.Cast(); diff --git a/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp b/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp index 58631fd3e9d1..b675a22a91c6 100644 --- a/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp +++ b/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp @@ -238,7 +238,7 @@ class TSolomonDqIntegration: public TDqIntegrationBase { return TSoWrite::Match(&write); } - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t) override { + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t, TExprContext&) override { const TDqSource dqSource(&node); const auto maybeSettings = dqSource.Settings().Maybe(); if (!maybeSettings) { diff --git a/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp b/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp index 28d4aebde0f2..7ff7fc1c5cc7 100644 --- a/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp +++ 
b/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp @@ -114,7 +114,7 @@ class TYdbDqIntegration: public TDqIntegrationBase { return read; } - void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t) override { + void FillSourceSettings(const TExprNode& node, ::google::protobuf::Any& protoSettings, TString& sourceType, size_t, TExprContext&) override { const TDqSource source(&node); if (const auto maySettings = source.Settings().Maybe()) { const auto settings = maySettings.Cast(); diff --git a/ydb/library/yql/public/purecalc/common/no_llvm/ya.make b/ydb/library/yql/public/purecalc/common/no_llvm/ya.make index 46b4d25a9aa9..9de5baf9ca0f 100644 --- a/ydb/library/yql/public/purecalc/common/no_llvm/ya.make +++ b/ydb/library/yql/public/purecalc/common/no_llvm/ya.make @@ -11,6 +11,7 @@ PEERDIR( ydb/library/yql/minikql/codegen/no_llvm ydb/library/yql/parser/pg_wrapper ydb/library/yql/parser/pg_wrapper/interface + ydb/library/yql/sql/pg ) END() diff --git a/ydb/library/yql/public/purecalc/common/no_pg_wrapper/ya.make b/ydb/library/yql/public/purecalc/common/no_pg_wrapper/ya.make new file mode 100644 index 000000000000..047ae07ac8ed --- /dev/null +++ b/ydb/library/yql/public/purecalc/common/no_pg_wrapper/ya.make @@ -0,0 +1,5 @@ +LIBRARY() + +INCLUDE(../ya.make.inc) + +END() diff --git a/ydb/library/yql/public/purecalc/common/ya.make b/ydb/library/yql/public/purecalc/common/ya.make index a526f25235ba..003134aa2718 100644 --- a/ydb/library/yql/public/purecalc/common/ya.make +++ b/ydb/library/yql/public/purecalc/common/ya.make @@ -10,6 +10,7 @@ PEERDIR( ydb/library/yql/minikql/comp_nodes/llvm14 ydb/library/yql/parser/pg_wrapper ydb/library/yql/parser/pg_wrapper/interface + ydb/library/yql/sql/pg ) END() diff --git a/ydb/library/yql/public/purecalc/common/ya.make.inc b/ydb/library/yql/public/purecalc/common/ya.make.inc index 8404177207c3..5dffa7926e20 100644 --- 
a/ydb/library/yql/public/purecalc/common/ya.make.inc +++ b/ydb/library/yql/public/purecalc/common/ya.make.inc @@ -27,7 +27,6 @@ SRCS( ) PEERDIR( - ydb/library/yql/sql/pg ydb/library/yql/ast ydb/library/yql/core/services ydb/library/yql/core/services/mounts @@ -36,7 +35,6 @@ PEERDIR( ydb/library/yql/utils/log ydb/library/yql/core ydb/library/yql/core/type_ann - ydb/library/yql/parser/pg_wrapper ydb/library/yql/providers/common/codec ydb/library/yql/providers/common/comp_nodes ydb/library/yql/providers/common/mkql diff --git a/ydb/public/api/protos/draft/fq.proto b/ydb/public/api/protos/draft/fq.proto index f76c01ef4944..0666228e5339 100644 --- a/ydb/public/api/protos/draft/fq.proto +++ b/ydb/public/api/protos/draft/fq.proto @@ -447,6 +447,7 @@ message DataStreams { string endpoint = 3 [(Ydb.length).le = 1024]; string database = 4 [(Ydb.length).le = 1024]; bool secure = 5; + bool shared_reading = 6; } message Monitoring { diff --git a/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp new file mode 100644 index 000000000000..350c0bd5b40c --- /dev/null +++ b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp @@ -0,0 +1,359 @@ +#include + +#include + +#include +#include + +#include + +namespace NYql::NDq { + +const ui64 PartitionId = 666; + +struct TFixture : public TPqIoTestFixture { + + TFixture() { + LocalRowDispatcherId = CaSetup->Runtime->AllocateEdgeActor(); + Coordinator1Id = CaSetup->Runtime->AllocateEdgeActor(); + Coordinator2Id = CaSetup->Runtime->AllocateEdgeActor(); + RowDispatcher1 = CaSetup->Runtime->AllocateEdgeActor(); + RowDispatcher2 = CaSetup->Runtime->AllocateEdgeActor(); + } + + void InitRdSource( + NYql::NPq::NProto::TDqPqTopicSource&& settings, + i64 freeSpace = 1_MB) + { + CaSetup->Execute([&](TFakeActor& actor) { + NPq::NProto::TDqReadTaskParams params; + auto* partitioninigParams = params.MutablePartitioningParams(); + partitioninigParams->SetTopicPartitionsCount(1); + 
partitioninigParams->SetEachTopicPartitionGroupId(PartitionId); + partitioninigParams->SetDqPartitionsCount(1); + + TString serializedParams; + Y_PROTOBUF_SUPPRESS_NODISCARD params.SerializeToString(&serializedParams); + + const THashMap secureParams; + const THashMap taskParams { {"pq", serializedParams} }; + + auto [dqSource, dqSourceAsActor] = CreateDqPqRdReadActor( + std::move(settings), + 0, + NYql::NDq::TCollectStatsLevel::None, + "query_1", + 0, + secureParams, + taskParams, + actor.SelfId(), // computeActorId + LocalRowDispatcherId, + actor.GetHolderFactory(), + MakeIntrusive(), + freeSpace); + + actor.InitAsyncInput(dqSource, dqSourceAsActor); + }); + } + + void ExpectCoordinatorChangesSubscribe() { + auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(LocalRowDispatcherId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + auto ExpectCoordinatorRequest(NActors::TActorId coordinatorId) { + auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(coordinatorId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + return eventHolder.Get(); + } + + void ExpectStartSession(ui64 expectedOffset, NActors::TActorId rowDispatcherId) { + auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(rowDispatcherId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetOffset() == expectedOffset); + } + + void ExpectStopSession(NActors::TActorId rowDispatcherId) { + auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(rowDispatcherId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + } + + void ExpectGetNextBatch(NActors::TActorId rowDispatcherId) { + auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(rowDispatcherId, TDuration::Seconds(5)); + UNIT_ASSERT(eventHolder.Get() != nullptr); + UNIT_ASSERT(eventHolder->Get()->Record.GetPartitionId() == PartitionId); + } + + void MockCoordinatorChanged(NActors::TActorId coordinatorId) { + CaSetup->Execute([&](TFakeActor& 
actor) { + auto event = new NFq::TEvRowDispatcher::TEvCoordinatorChanged(coordinatorId); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, LocalRowDispatcherId, event)); + }); + } + + void MockCoordinatorResult(NActors::TActorId rowDispatcherId, ui64 cookie = 0) { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NFq::TEvRowDispatcher::TEvCoordinatorResult(); + auto* partitions = event->Record.AddPartitions(); + partitions->AddPartitionId(PartitionId); + ActorIdToProto(rowDispatcherId, partitions->MutableActorId()); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, Coordinator1Id, event, 0, cookie)); + }); + } + + void MockAck(NActors::TActorId rowDispatcherId) { + CaSetup->Execute([&](TFakeActor& actor) { + NFq::NRowDispatcherProto::TEvStartSession proto; + proto.SetPartitionId(PartitionId); + auto event = new NFq::TEvRowDispatcher::TEvStartSessionAck(proto); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, rowDispatcherId, event)); + }); + } + + void MockNewDataArrived(NActors::TActorId rowDispatcherId) { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NFq::TEvRowDispatcher::TEvNewDataArrived(); + event->Record.SetPartitionId(PartitionId); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, rowDispatcherId, event)); + }); + } + + void MockMessageBatch(ui64 offset, const std::vector& jsons, NActors::TActorId rowDispatcherId) { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NFq::TEvRowDispatcher::TEvMessageBatch(); + for (const auto& json :jsons) { + NFq::NRowDispatcherProto::TEvMessage message; + message.SetJson(json); + message.SetOffset(offset++); + *event->Record.AddMessages() = message; + } + event->Record.SetPartitionId(PartitionId); + event->Record.SetNextMessageOffset(offset); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, rowDispatcherId, event)); + }); + } + + void 
MockSessionError() { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NFq::TEvRowDispatcher::TEvSessionError(); + event->Record.SetMessage("A problem has been detected and session has been shut down to prevent damage your life"); + event->Record.SetPartitionId(PartitionId); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, RowDispatcher1, event)); + }); + } + + template + void AssertDataWithWatermarks( + const std::vector>& actual, + const std::vector& expected, + const std::vector& watermarkBeforePositions) + { + auto expectedPos = 0U; + auto watermarksBeforeIter = watermarkBeforePositions.begin(); + + for (auto item : actual) { + if (std::holds_alternative(item)) { + if (watermarksBeforeIter != watermarkBeforePositions.end()) { + watermarksBeforeIter++; + } + continue; + } else { + UNIT_ASSERT_C(expectedPos < expected.size(), "Too many data items"); + UNIT_ASSERT_C( + watermarksBeforeIter == watermarkBeforePositions.end() || + *watermarksBeforeIter > expectedPos, + "Watermark before item on position " << expectedPos << " was expected"); + UNIT_ASSERT_EQUAL(std::get(item), expected.at(expectedPos)); + expectedPos++; + } + } + } + + void MockDisconnected() { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NActors::TEvInterconnect::TEvNodeDisconnected(CaSetup->Runtime->GetNodeId(0)); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, RowDispatcher1, event)); + }); + } + + void MockConnected() { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NActors::TEvInterconnect::TEvNodeConnected(CaSetup->Runtime->GetNodeId(0)); + CaSetup->Runtime->Send(new NActors::IEventHandle(*actor.DqAsyncInputActorId, RowDispatcher1, event)); + }); + } + + void MockUndelivered() { + CaSetup->Execute([&](TFakeActor& actor) { + auto event = new NActors::TEvents::TEvUndelivered(0, NActors::TEvents::TEvUndelivered::ReasonActorUnknown); + CaSetup->Runtime->Send(new 
NActors::IEventHandle(*actor.DqAsyncInputActorId, RowDispatcher1, event)); + }); + } + + + void StartSession() { + InitRdSource(BuildPqTopicSourceSettings("topicName")); + SourceRead(UVParser); + ExpectCoordinatorChangesSubscribe(); + + MockCoordinatorChanged(Coordinator1Id); + auto req =ExpectCoordinatorRequest(Coordinator1Id); + + MockCoordinatorResult(RowDispatcher1, req->Cookie); + ExpectStartSession(0, RowDispatcher1); + MockAck(RowDispatcher1); + } + + void ProcessSomeJsons(ui64 offset, const std::vector& jsons, NActors::TActorId rowDispatcherId) { + MockNewDataArrived(rowDispatcherId); + ExpectGetNextBatch(rowDispatcherId); + + MockMessageBatch(offset, jsons, rowDispatcherId); + + auto result = SourceReadDataUntil(UVParser, jsons.size()); + AssertDataWithWatermarks(result, jsons, {}); + } + + const TString Json1 = "{\"dt\":100,\"value\":\"value1\"}"; + const TString Json2 = "{\"dt\":200,\"value\":\"value2\"}"; + const TString Json3 = "{\"dt\":300,\"value\":\"value3\"}"; + const TString Json4 = "{\"dt\":400,\"value\":\"value4\"}"; + + NActors::TActorId LocalRowDispatcherId; + NActors::TActorId Coordinator1Id; + NActors::TActorId Coordinator2Id; + NActors::TActorId RowDispatcher1; + NActors::TActorId RowDispatcher2; +}; + +Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { + Y_UNIT_TEST_F(TestReadFromTopic, TFixture) { + StartSession(); + ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); + } + + Y_UNIT_TEST_F(SessionError, TFixture) { + StartSession(); + + TInstant deadline = Now() + TDuration::Seconds(5); + auto future = CaSetup->AsyncInputPromises.FatalError.GetFuture(); + MockSessionError(); + + bool failured = false; + while (Now() < deadline) { + SourceRead(UVParser); + if (future.HasValue()) { + UNIT_ASSERT_STRING_CONTAINS(future.GetValue().ToOneLineString(), "damage your life"); + failured = true; + break; + } + } + UNIT_ASSERT_C(failured, "Failure timeout"); + } + + Y_UNIT_TEST_F(ReadWithFreeSpace, TFixture) { + StartSession(); + + 
MockNewDataArrived(RowDispatcher1); + ExpectGetNextBatch(RowDispatcher1); + + const std::vector data1 = {Json1, Json2}; + MockMessageBatch(0, data1, RowDispatcher1); + + const std::vector data2 = {Json3, Json4}; + MockMessageBatch(2, data2, RowDispatcher1); + + auto result = SourceReadDataUntil(UVParser, 1, 1); + std::vector expected{data1}; + AssertDataWithWatermarks(result, expected, {}); + + UNIT_ASSERT_EQUAL(SourceRead(UVParser, 0).size(), 0); + } + + Y_UNIT_TEST(TestSaveLoadPqRdRead) { + TSourceState state; + + { + TFixture f; + f.StartSession(); + f.ProcessSomeJsons(0, {f.Json1, f.Json2}, f.RowDispatcher1); // offsets: 0, 1 + + f.SaveSourceState(CreateCheckpoint(), state); + Cerr << "State saved" << Endl; + } + { + TFixture f; + f.InitRdSource(BuildPqTopicSourceSettings("topicName")); + f.SourceRead(UVParser); + f.LoadSource(state); + f.SourceRead(UVParser); + f.ExpectCoordinatorChangesSubscribe(); + + f.MockCoordinatorChanged(f.Coordinator1Id); + auto req = f.ExpectCoordinatorRequest(f.Coordinator1Id); + + f.MockCoordinatorResult(f.RowDispatcher1, req->Cookie); + f.ExpectStartSession(2, f.RowDispatcher1); + f.MockAck(f.RowDispatcher1); + + f.ProcessSomeJsons(2, {f.Json3}, f.RowDispatcher1); // offsets: 2 + state.Data.clear(); + f.SaveSourceState(CreateCheckpoint(), state); + Cerr << "State saved" << Endl; + } + { + TFixture f; + f.InitRdSource(BuildPqTopicSourceSettings("topicName")); + f.SourceRead(UVParser); + f.LoadSource(state); + f.SourceRead(UVParser); + f.ExpectCoordinatorChangesSubscribe(); + + f.MockCoordinatorChanged(f.Coordinator1Id); + auto req = f.ExpectCoordinatorRequest(f.Coordinator1Id); + + f.MockCoordinatorResult(f.RowDispatcher1, req->Cookie); + f.ExpectStartSession(3, f.RowDispatcher1); + f.MockAck(f.RowDispatcher1); + + f.ProcessSomeJsons(3, {f.Json4}, f.RowDispatcher1); // offsets: 3 + } + } + + Y_UNIT_TEST_F(CoordinatorChanged, TFixture) { + StartSession(); + ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); + MockMessageBatch(2, 
{Json3}, RowDispatcher1); + + // change active Coordinator + MockCoordinatorChanged(Coordinator2Id); + ExpectStopSession(RowDispatcher1); + + auto result = SourceReadDataUntil(UVParser, 1); + AssertDataWithWatermarks(result, {Json3}, {}); + + auto req = ExpectCoordinatorRequest(Coordinator2Id); + MockCoordinatorResult(RowDispatcher2, req->Cookie); + + ExpectStartSession(3, RowDispatcher2); + MockAck(RowDispatcher2); + + ProcessSomeJsons(3, {Json4}, RowDispatcher2); + } + + Y_UNIT_TEST_F(RowDispatcherIsRestarted, TFixture) { + StartSession(); + ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); + MockDisconnected(); + MockConnected(); + MockUndelivered(); + + auto req = ExpectCoordinatorRequest(Coordinator1Id); + MockCoordinatorResult(RowDispatcher1, req->Cookie); + ExpectStartSession(2, RowDispatcher1); + MockAck(RowDispatcher1); + + ProcessSomeJsons(2, {Json3}, RowDispatcher1); + } +} +} // NYql::NDq diff --git a/ydb/tests/fq/pq_async_io/dq_pq_read_actor_ut.cpp b/ydb/tests/fq/pq_async_io/ut/dq_pq_read_actor_ut.cpp similarity index 99% rename from ydb/tests/fq/pq_async_io/dq_pq_read_actor_ut.cpp rename to ydb/tests/fq/pq_async_io/ut/dq_pq_read_actor_ut.cpp index fbc1cfd6104f..37438882c234 100644 --- a/ydb/tests/fq/pq_async_io/dq_pq_read_actor_ut.cpp +++ b/ydb/tests/fq/pq_async_io/ut/dq_pq_read_actor_ut.cpp @@ -1,4 +1,4 @@ -#include "ut_helpers.h" +#include #include diff --git a/ydb/tests/fq/pq_async_io/dq_pq_write_actor_ut.cpp b/ydb/tests/fq/pq_async_io/ut/dq_pq_write_actor_ut.cpp similarity index 98% rename from ydb/tests/fq/pq_async_io/dq_pq_write_actor_ut.cpp rename to ydb/tests/fq/pq_async_io/ut/dq_pq_write_actor_ut.cpp index 8bb3be26c27a..9094c7b9f58c 100644 --- a/ydb/tests/fq/pq_async_io/dq_pq_write_actor_ut.cpp +++ b/ydb/tests/fq/pq_async_io/ut/dq_pq_write_actor_ut.cpp @@ -1,4 +1,4 @@ -#include "ut_helpers.h" +#include #include diff --git a/ydb/tests/fq/pq_async_io/ut/ya.make b/ydb/tests/fq/pq_async_io/ut/ya.make new file mode 100644 index 
000000000000..82f2450a647d --- /dev/null +++ b/ydb/tests/fq/pq_async_io/ut/ya.make @@ -0,0 +1,28 @@ +UNITTEST_FOR(ydb/library/yql/providers/pq/async_io) + +SIZE(MEDIUM) + +INCLUDE(${ARCADIA_ROOT}/ydb/tests/tools/fq_runner/ydb_runner_with_datastreams.inc) + +SRCS( + dq_pq_rd_read_actor_ut.cpp + dq_pq_read_actor_ut.cpp + dq_pq_write_actor_ut.cpp +) + +PEERDIR( + ydb/core/testlib/basics/default + ydb/library/yql/minikql/comp_nodes/llvm14 + ydb/library/yql/minikql/computation/llvm14 + ydb/library/yql/providers/common/comp_nodes + ydb/library/yql/providers/common/ut_helpers + ydb/library/yql/public/udf/service/exception_policy + ydb/library/yql/sql + ydb/public/sdk/cpp/client/ydb_datastreams + ydb/public/sdk/cpp/client/ydb_persqueue_public + ydb/tests/fq/pq_async_io +) + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/tests/fq/pq_async_io/ut_helpers.h b/ydb/tests/fq/pq_async_io/ut_helpers.h index 96fd267ecf6a..6e9f92007d2b 100644 --- a/ydb/tests/fq/pq_async_io/ut_helpers.h +++ b/ydb/tests/fq/pq_async_io/ut_helpers.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/ydb/tests/fq/pq_async_io/ya.make b/ydb/tests/fq/pq_async_io/ya.make index d8a793a0a047..c27e93ce4ce6 100644 --- a/ydb/tests/fq/pq_async_io/ya.make +++ b/ydb/tests/fq/pq_async_io/ya.make @@ -1,25 +1,14 @@ -UNITTEST_FOR(ydb/library/yql/providers/pq/async_io) - -SIZE(MEDIUM) - -INCLUDE(${ARCADIA_ROOT}/ydb/tests/tools/fq_runner/ydb_runner_with_datastreams.inc) +LIBRARY() SRCS( - dq_pq_read_actor_ut.cpp - dq_pq_write_actor_ut.cpp ut_helpers.cpp ) PEERDIR( - ydb/core/testlib/basics/default ydb/library/yql/minikql/computation/llvm14 - ydb/library/yql/public/udf/service/exception_policy - ydb/library/yql/providers/common/comp_nodes ydb/library/yql/providers/common/ut_helpers - ydb/library/yql/sql ydb/public/sdk/cpp/client/ydb_datastreams ydb/public/sdk/cpp/client/ydb_persqueue_public - ydb/library/yql/minikql/comp_nodes/llvm14 ) YQL_LAST_ABI_VERSION() diff --git 
a/ydb/tests/fq/yds/test_row_dispatcher.py b/ydb/tests/fq/yds/test_row_dispatcher.py new file mode 100644 index 000000000000..7cafe48661cc --- /dev/null +++ b/ydb/tests/fq/yds/test_row_dispatcher.py @@ -0,0 +1,681 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import pytest +import logging +import time + +from ydb.tests.tools.fq_runner.kikimr_utils import yq_v1 +from ydb.tests.tools.datastreams_helpers.test_yds_base import TestYdsBase + +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimr +from ydb.tests.tools.fq_runner.kikimr_runner import StreamingOverKikimrConfig +from ydb.tests.tools.fq_runner.kikimr_runner import TenantConfig + +from ydb.tests.tools.datastreams_helpers.control_plane import list_read_rules +from ydb.tests.tools.datastreams_helpers.control_plane import create_stream, create_read_rule +from ydb.tests.tools.datastreams_helpers.data_plane import read_stream, write_stream +from ydb.tests.tools.fq_runner.fq_client import StreamingDisposition + +import ydb.public.api.protos.draft.fq_pb2 as fq + +YDS_CONNECTION = "yds" + + +@pytest.fixture +def kikimr(request): + kikimr_conf = StreamingOverKikimrConfig( + cloud_mode=True, node_count={"/cp": TenantConfig(1), "/compute": TenantConfig(2)} + ) + kikimr = StreamingOverKikimr(kikimr_conf) + kikimr.compute_plane.fq_config['row_dispatcher']['enabled'] = True + kikimr.compute_plane.fq_config['row_dispatcher']['without_consumer'] = True + kikimr.start_mvp_mock_server() + kikimr.start() + yield kikimr + kikimr.stop_mvp_mock_server() + kikimr.stop() + + +def start_yds_query(kikimr, client, sql) -> str: + query_id = client.create_query("simple", sql, type=fq.QueryContent.QueryType.STREAMING).result.query_id + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + kikimr.compute_plane.wait_zero_checkpoint(query_id) + return query_id + + +def stop_yds_query(client, query_id): + client.abort_query(query_id) + client.wait_query(query_id) + + +def wait_actor_count(kikimr, 
activity, expected_count): + deadline = time.time() + 60 + while True: + count = 0 + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + count = count + kikimr.compute_plane.get_actor_count(node_index, activity) + if count == expected_count: + break + assert time.time() < deadline, f"Waiting actor {activity} count failed, current count {count}" + time.sleep(1) + pass + + +def wait_row_dispatcher_sensor_value(kikimr, sensor, expected_count, exact_match=True): + deadline = time.time() + 60 + while True: + count = 0 + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + value = kikimr.compute_plane.get_sensors(node_index, "yq").find_sensor( + {"subsystem": "row_dispatcher", "sensor": sensor}) + count += value if value is not None else 0 + if count == expected_count: + break + if not exact_match and count > expected_count: + break + assert time.time() < deadline, f"Waiting sensor {sensor} value failed, current count {count}" + time.sleep(1) + pass + + +class TestPqRowDispatcher(TestYdsBase): + + @yq_v1 + def test_read_raw_format_without_row_dispatcher(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_read_raw_format_without_row_dispatcher", create_output=False) + + output_topic = "pq_test_pq_read_write_output" + + create_stream(output_topic, partitions_count=1) + create_read_rule(output_topic, self.consumer_name) + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT * FROM {YDS_CONNECTION}.`{self.input_topic}`;''' + + query_id = start_yds_query(kikimr, client, sql) + data = ['{"time" = 101;}', '{"time" = 102;}'] + + self.write_stream(data) + expected = data + assert self.read_stream(len(expected), topic_path=output_topic) == expected + + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + stop_yds_query(client, query_id) + + @yq_v1 + def test_simple_not_null(self, kikimr, client): + 
client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_simple_not_null") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + time.sleep(10) + + data = [ + '{"time": 101, "data": "hello1", "event": "event1"}', + '{"time": 102, "data": "hello2", "event": "event2"}', + '{"time": 103, "data": "hello3", "event": "event3"}', + ] + + self.write_stream(data) + expected = ['101', '102', '103'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + stop_yds_query(client, query_id) + # Assert that all read rules were removed after query stops + read_rules = list_read_rules(self.input_topic) + assert len(read_rules) == 0, read_rules + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + @pytest.mark.skip(reason="Is not implemented") + def test_simple_optional(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_simple_optional") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String));''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = ['{"time": 101, "data": "hello1", "event": "event1"}', '{"time": 102, "event": "event2"}'] + + self.write_stream(data) + expected = ['101', '102'] + assert 
self.read_stream(len(expected), topic_path=self.output_topic) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + + stop_yds_query(client, query_id) + # Assert that all read rules were removed after query stops + read_rules = list_read_rules(self.input_topic) + assert len(read_rules) == 0, read_rules + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_scheme_error(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_scheme_error") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = ['{"this": "is", not json}', '{"noch einmal / nicht json"}'] + self.write_stream(data) + + client.wait_query_status(query_id, fq.QueryMeta.FAILED) + issues = str(client.describe_query(query_id).result.query.issue) + assert "Failed to unwrap empty optional" in issues, "Incorrect Issues: " + issues + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 0) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + data = ['{"time": 101, "data": "hello1", "event": "event1"}'] + self.write_stream(data) + expected = ['101'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_filter(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_filter") + + sql = Rf''' + INSERT 
INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, data String NOT NULL, event String NOT NULL)) + WHERE time > 101UL or event = "event666";''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 101, "data": "hello1", "event": "event1"}', + '{"time": 102, "data": "hello2", "event": "event2"}', + ] + + self.write_stream(data) + expected = ['102'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + + stop_yds_query(client, query_id) + # Assert that all read rules were removed after query stops + read_rules = list_read_rules(self.input_topic) + assert len(read_rules) == 0, read_rules + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + issues = str(client.describe_query(query_id).result.query.transient_issue) + assert "Row dispatcher will use the predicate: WHERE (`time` > 101" in issues, "Incorrect Issues: " + issues + + @yq_v1 + def test_filter_with_mr(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_filter_with_mr") + + sql = Rf''' + pragma FeatureR010="prototype"; + pragma config.flags("TimeOrderRecoverDelay", "-10"); + pragma config.flags("TimeOrderRecoverAhead", "10"); + + $data = + SELECT * FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, event_class String NOT NULL, event_type UInt64 NOT NULL)) + WHERE event_class = "event_class2"; + + $match = + SELECT * FROM $data + MATCH_RECOGNIZE( + ORDER BY CAST(time as Timestamp) + MEASURES + LAST(M1.event_type) as event_type + ONE ROW PER MATCH + PATTERN ( M1 ) + DEFINE + M1 as + M1.event_class = "event_class2" + ); + + INSERT INTO 
{YDS_CONNECTION}.`{self.output_topic}` + SELECT ToBytes(Unwrap(Json::SerializeJson(Yson::From(TableRow())))) FROM $match; + ''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 100, "event_class": "event_class1", "event_type": 1}', + '{"time": 105, "event_class": "event_class2", "event_type": 2}', + '{"time": 110, "event_class": "event_class2", "event_type": 3}', + '{"time": 116, "event_class": "event_class2", "event_type": 4}' + ] + + self.write_stream(data) + expected = ['{"event_type":2}'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + stop_yds_query(client, query_id) + + issues = str(client.describe_query(query_id).result.query.transient_issue) + assert "Row dispatcher will use the predicate: WHERE `event_class` =" in issues, "Incorrect Issues: " + issues + + @yq_v1 + def test_start_new_query(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_start_new_query", create_output=False) + + output_topic1 = "pq_test_pq_read_write_output1" + output_topic2 = "pq_test_pq_read_write_output2" + output_topic3 = "pq_test_pq_read_write_output3" + create_stream(output_topic1, partitions_count=1) + create_read_rule(output_topic1, self.consumer_name) + + create_stream(output_topic2, partitions_count=1) + create_read_rule(output_topic2, self.consumer_name) + + create_stream(output_topic3, partitions_count=1) + create_read_rule(output_topic3, self.consumer_name) + + sql1 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic1}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + sql2 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic2}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH 
(format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + query_id1 = start_yds_query(kikimr, client, sql1) + query_id2 = start_yds_query(kikimr, client, sql2) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 101, "data": "hello1", "event": "event1"}', + '{"time": 102, "data": "hello2", "event": "event2"}', + ] + + self.write_stream(data) + expected = ['101', '102'] + assert self.read_stream(len(expected), topic_path=output_topic1) == expected + assert self.read_stream(len(expected), topic_path=output_topic2) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 2) + + # nothing unnecessary... + assert not read_stream(output_topic1, 1, True, self.consumer_name, timeout=1) + assert not read_stream(output_topic2, 1, True, self.consumer_name, timeout=1) + + sql3 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic3}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + query_id3 = start_yds_query(kikimr, client, sql3) + + data = [ + '{"time": 103, "data": "hello3", "event": "event3"}', + '{"time": 104, "data": "hello4", "event": "event4"}', + ] + + self.write_stream(data) + expected = ['103', '104'] + + assert self.read_stream(len(expected), topic_path=output_topic1) == expected + assert self.read_stream(len(expected), topic_path=output_topic2) == expected + assert self.read_stream(len(expected), topic_path=output_topic3) == expected + + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + assert not read_stream(output_topic1, 1, True, self.consumer_name, timeout=1) + assert not read_stream(output_topic2, 1, True, self.consumer_name, timeout=1) + assert not read_stream(output_topic3, 1, True, self.consumer_name, timeout=1) + + stop_yds_query(client, query_id1) + stop_yds_query(client, query_id2) + stop_yds_query(client, query_id3) + + # Assert that all read rules were removed after query 
stops + read_rules = list_read_rules(self.input_topic) + assert len(read_rules) == 0, read_rules + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_stop_start(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_stop_start", create_output=False) + + output_topic = "test_stop_start" + create_stream(output_topic, partitions_count=1) + create_read_rule(output_topic, self.consumer_name) + + sql1 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql1) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = ['{"time": 101}', '{"time": 102}'] + self.write_stream(data) + expected = ['101', '102'] + assert self.read_stream(len(expected), topic_path=output_topic) == expected + + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + client.modify_query( + query_id, + "continue", + sql1, + type=fq.QueryContent.QueryType.STREAMING, + state_load_mode=fq.StateLoadMode.EMPTY, + streaming_disposition=StreamingDisposition.from_last_checkpoint(), + ) + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + + data = ['{"time": 103}', '{"time": 104}'] + + self.write_stream(data) + expected = ['103', '104'] + assert self.read_stream(len(expected), topic_path=output_topic) == expected + + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_stop_start_with_filter(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True 
+ ) + self.init_topics("test_stop_start", create_output=False) + + output_topic = "test_stop_start" + create_stream(output_topic, partitions_count=1) + create_read_rule(output_topic, self.consumer_name) + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL)) + WHERE time > 200UL;''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = ['{"time": 101}', '{"time": 102}'] + self.write_stream(data) + + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 10 + ) + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL));''' + + client.modify_query( + query_id, + "continue", + sql, + type=fq.QueryContent.QueryType.STREAMING, + state_load_mode=fq.StateLoadMode.EMPTY, + streaming_disposition=StreamingDisposition.from_last_checkpoint(), + ) + client.wait_query_status(query_id, fq.QueryMeta.RUNNING) + + data = ['{"time": 203}', '{"time": 204}'] + self.write_stream(data) + expected = ['203', '204'] + assert self.read_stream(len(expected), topic_path=output_topic) == expected + + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_restart_compute_node(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_restart_compute_node") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, 
SCHEMA (time Int32 NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = ['{"time": 101, "data": "hello1"}', '{"time": 102, "data": "hello2"}'] + + self.write_stream(data) + expected = ['101', '102'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + node_index = 2 + logging.debug("Restart compute node {}".format(node_index)) + kikimr.compute_plane.kikimr_cluster.nodes[node_index].stop() + kikimr.compute_plane.kikimr_cluster.nodes[node_index].start() + kikimr.compute_plane.wait_bootstrap(node_index) + + data = ['{"time": 103, "data": "hello3"}', '{"time": 104, "data": "hello4"}'] + self.write_stream(data) + expected = ['103', '104'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + kikimr.compute_plane.wait_completed_checkpoints( + query_id, kikimr.compute_plane.get_completed_checkpoints(query_id) + 1 + ) + + node_index = 1 + logging.debug("Restart compute node {}".format(node_index)) + kikimr.control_plane.kikimr_cluster.nodes[node_index].stop() + kikimr.control_plane.kikimr_cluster.nodes[node_index].start() + kikimr.control_plane.wait_bootstrap(node_index) + + data = ['{"time": 105, "data": "hello5"}', '{"time": 106, "data": "hello6"}'] + self.write_stream(data) + expected = ['105', '106'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_3_sessions(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + 
self.init_topics("test_3_session", create_output=False) + + output_topic1 = "test_3_session1" + output_topic2 = "test_3_session2" + output_topic3 = "test_3_session3" + create_stream(output_topic1, partitions_count=1) + create_read_rule(output_topic1, self.consumer_name) + + create_stream(output_topic2, partitions_count=1) + create_read_rule(output_topic2, self.consumer_name) + + create_stream(output_topic3, partitions_count=1) + create_read_rule(output_topic3, self.consumer_name) + + sql1 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic1}` + SELECT Unwrap(Json::SerializeJson(Yson::From(TableRow()))) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + sql2 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic2}` + SELECT Unwrap(Json::SerializeJson(Yson::From(TableRow()))) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + + sql3 = Rf''' + INSERT INTO {YDS_CONNECTION}.`{output_topic3}` + SELECT Unwrap(Json::SerializeJson(Yson::From(TableRow()))) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + query_id1 = start_yds_query(kikimr, client, sql1) + query_id2 = start_yds_query(kikimr, client, sql2) + query_id3 = start_yds_query(kikimr, client, sql3) + + data = ['{"time":101}', '{"time":102}'] + + self.write_stream(data) + expected = data + assert self.read_stream(len(expected), topic_path=output_topic1) == expected + assert self.read_stream(len(expected), topic_path=output_topic2) == expected + assert self.read_stream(len(expected), topic_path=output_topic3) == expected + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + kikimr.compute_plane.wait_completed_checkpoints( + query_id1, kikimr.compute_plane.get_completed_checkpoints(query_id1) + 1 + ) + stop_yds_query(client, query_id1) + + data = ['{"time":103}', '{"time":104}'] + self.write_stream(data) + expected = data + assert not 
read_stream(output_topic1, 1, True, self.consumer_name, timeout=1) + assert self.read_stream(len(expected), topic_path=output_topic2) == expected + assert self.read_stream(len(expected), topic_path=output_topic3) == expected + + client.modify_query( + query_id1, + "continue", + sql1, + type=fq.QueryContent.QueryType.STREAMING, + state_load_mode=fq.StateLoadMode.EMPTY, + streaming_disposition=StreamingDisposition.from_last_checkpoint(), + ) + client.wait_query_status(query_id1, fq.QueryMeta.RUNNING) + + assert self.read_stream(len(expected), topic_path=output_topic1) == expected + + data = ['{"time":105}', '{"time":106}'] + self.write_stream(data) + expected = data + assert self.read_stream(len(expected), topic_path=output_topic1) == expected + assert self.read_stream(len(expected), topic_path=output_topic2) == expected + assert self.read_stream(len(expected), topic_path=output_topic3) == expected + + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + stop_yds_query(client, query_id1) + stop_yds_query(client, query_id2) + stop_yds_query(client, query_id3) + + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_many_partitions(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_simple_not_null", partitions_count=4) + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 4) + + input_messages1 = [Rf'''{{"time": {c}}}''' for c in range(100, 110)] + write_stream(self.input_topic, input_messages1, "partition_key1") + + input_messages2 = [Rf'''{{"time": {c}}}''' for c in range(110, 120)] + write_stream(self.input_topic, input_messages2, "partition_key2") + + 
input_messages3 = [Rf'''{{"time": {c}}}''' for c in range(120, 130)] + write_stream(self.input_topic, input_messages3, "partition_key3") + + input_messages4 = [Rf'''{{"time": {c}}}''' for c in range(130, 140)] + write_stream(self.input_topic, input_messages4, "partition_key4") + + expected = [Rf'''{c}''' for c in range(100, 140)] + assert sorted(self.read_stream(len(expected), topic_path=self.output_topic)) == expected + + stop_yds_query(client, query_id) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + + @yq_v1 + def test_sensors(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_sensors") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL));''' + + query_id = start_yds_query(kikimr, client, sql) + + self.write_stream(['{"time": 101}']) + assert self.read_stream(1, topic_path=self.output_topic) == ['101'] + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + wait_row_dispatcher_sensor_value(kikimr, "ClientsCount", 1) + wait_row_dispatcher_sensor_value(kikimr, "RowsSent", 1, exact_match=False) + wait_row_dispatcher_sensor_value(kikimr, "IncomingRequests", 1, exact_match=False) + wait_row_dispatcher_sensor_value(kikimr, "RowsRead", 1, exact_match=False) + + stop_yds_query(client, query_id) + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 0) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + wait_row_dispatcher_sensor_value(kikimr, "ClientsCount", 0) diff --git a/ydb/tests/fq/yds/ya.make b/ydb/tests/fq/yds/ya.make index 55ff7b3e3ad7..90dd2a544cbe 100644 --- a/ydb/tests/fq/yds/ya.make +++ b/ydb/tests/fq/yds/ya.make @@ -42,6 +42,7 @@ TEST_SRCS( test_recovery_match_recognize.py test_recovery_mz.py test_restart_query.py + 
test_row_dispatcher.py test_select_1.py test_select_limit_db_id.py test_select_limit.py diff --git a/ydb/tests/tools/fq_runner/fq_client.py b/ydb/tests/tools/fq_runner/fq_client.py index 001b7cf3788d..7043657cf6ab 100644 --- a/ydb/tests/tools/fq_runner/fq_client.py +++ b/ydb/tests/tools/fq_runner/fq_client.py @@ -405,7 +405,7 @@ def create_ydb_connection(self, name, database_id, @retry.retry_intrusive def create_yds_connection(self, name, database=None, endpoint=None, database_id=None, visibility=fq.Acl.Visibility.PRIVATE, auth_method=AuthMethod.no_auth(), - check_issues=True): + check_issues=True, shared_reading=False): assert (database_id is not None and database is None and endpoint is None) or ( database_id is None and database is not None and endpoint is not None) request = fq.CreateConnectionRequest() @@ -417,6 +417,8 @@ def create_yds_connection(self, name, database=None, endpoint=None, database_id= yds.database = database yds.endpoint = endpoint + yds.shared_reading = shared_reading + yds.auth.CopyFrom(auth_method) request.content.acl.visibility = visibility return self.create_connection(request, check_issues) diff --git a/ydb/tests/tools/fq_runner/kikimr_runner.py b/ydb/tests/tools/fq_runner/kikimr_runner.py index d0480e8dd533..c343cf112093 100644 --- a/ydb/tests/tools/fq_runner/kikimr_runner.py +++ b/ydb/tests/tools/fq_runner/kikimr_runner.py @@ -117,6 +117,7 @@ def enable_logs(self): self.enable_logging("FQ_QUOTA_PROXY") self.enable_logging("PUBLIC_HTTP") self.enable_logging("FQ_CONTROL_PLANE_CONFIG") + self.enable_logging("FQ_ROW_DISPATCHER", LogLevels.TRACE) # self.enable_logging("GRPC_SERVER") @abc.abstractclassmethod @@ -363,7 +364,7 @@ def wait_completed_checkpoints(self, query_id, checkpoints_count, completed = self.get_completed_checkpoints(query_id, expect_counters_exist=expect_counters_exist) if completed >= checkpoints_count: break - assert time.time() < deadline, "Wait zero checkpoint failed" + assert time.time() < deadline, "Wait zero 
checkpoint failed, actual completed: " + str(completed) time.sleep(yatest_common.plain_or_under_sanitizer(0.5, 2)) def wait_zero_checkpoint(self, query_id, timeout=yatest_common.plain_or_under_sanitizer(30, 150), @@ -514,6 +515,17 @@ def fill_config(self, control_plane): self.fill_storage_config(fq_config['checkpoint_coordinator']['storage'], "CheckpointCoordinatorStorage_" + self.uuid) + fq_config['row_dispatcher'] = { + 'enabled': True, + 'timeout_before_start_session_sec': 2, + 'send_status_period_sec': 2, + 'max_session_used_memory': 1000000, + 'without_consumer': True} + fq_config['row_dispatcher']['coordinator'] = {'coordination_node_path': "row_dispatcher"} + fq_config['row_dispatcher']['coordinator']['database'] = {} + self.fill_storage_config(fq_config['row_dispatcher']['coordinator']['database'], + "RowDispatcher_" + self.uuid) + fq_config['quotas_manager'] = {'enabled': True} fq_config['rate_limiter'] = {'enabled': True} From 0da0d6abdfa632ec400a7da3b2cb09b861f464e2 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 3 Oct 2024 09:49:00 +0300 Subject: [PATCH 37/56] YQ-3583 Improvements after load tests / to stable (#10019) --- .../fq/libs/row_dispatcher/topic_session.cpp | 9 ++++++-- .../row_dispatcher/ut/topic_session_ut.cpp | 21 +++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp index 9623806ee87a..a1dc73bfef63 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -92,6 +92,7 @@ struct TEvPrivate { ui64 PrintStatePeriodSec = 60; ui64 MaxBatchSizeBytes = 10000000; +ui64 MaxHandledEvents = 1000; TVector GetVector(const google::protobuf::RepeatedPtrField& value) { return {value.begin(), value.end()}; @@ -227,6 +228,7 @@ class TTopicSession : public TActorBootstrapped { cFunc(NActors::TEvents::TEvPoisonPill::EventType, PassAway); 
IgnoreFunc(NFq::TEvPrivate::TEvPqEventsReady); IgnoreFunc(NFq::TEvPrivate::TEvCreateSession); + IgnoreFunc(NFq::TEvPrivate::TEvDataParsed); IgnoreFunc(NFq::TEvPrivate::TEvDataAfterFilteration); IgnoreFunc(NFq::TEvPrivate::TEvStatus); IgnoreFunc(NFq::TEvPrivate::TEvDataFiltered); @@ -442,7 +444,7 @@ void TTopicSession::Handle(TEvRowDispatcher::TEvGetNextBatch::TPtr& ev) { } void TTopicSession::HandleNewEvents() { - while (true) { + for (ui64 i = 0; i < MaxHandledEvents; ++i) { if (!ReadSession) { return; } @@ -475,7 +477,6 @@ void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionE LOG_ROW_DISPATCHER_TRACE("Data received: " << message.DebugString(true)); TString item = message.GetData(); - item.Detach(); Self.SendToParsing(message.GetOffset(), item); Self.LastMessageOffset = message.GetOffset(); } @@ -540,6 +541,10 @@ void TTopicSession::SendToParsing(ui64 offset, const TString& message) { } } + if (ClientsWithoutPredicate.size() == Clients.size()) { + return; + } + try { Parser->Push(offset, message); } catch (const std::exception& e) { diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp index ba24378e0a35..65c24fcb85f1 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -171,6 +171,27 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { StopSession(ReadActorId2, source); } + Y_UNIT_TEST_F(TwoSessionWithoutPredicate, TFixture) { + const TString topicName = "twowithoutpredicate"; + PQCreateStream(topicName); + Init(topicName); + auto source1 = BuildSource(topicName, true); + auto source2 = BuildSource(topicName, true); + StartSession(ReadActorId1, source1); + StartSession(ReadActorId2, source2); + + const std::vector data = { Json1 }; + PQWrite(data, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new 
TEvRowDispatcher::TEvGetNextBatch())); + Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); + ExpectMessageBatch(ReadActorId1, { Json1 }); + ExpectMessageBatch(ReadActorId2, { Json1 }); + + StopSession(ReadActorId1, source1); + StopSession(ReadActorId2, source2); + } + Y_UNIT_TEST_F(SessionWithPredicateAndSessionWithoutPredicate, TFixture) { const TString topicName = "topic2"; PQCreateStream(topicName); From a8045bfc42edcc0485d2b25570736bf1772958a8 Mon Sep 17 00:00:00 2001 From: Vitaly Isaev Date: Tue, 8 Oct 2024 23:54:49 +0300 Subject: [PATCH 38/56] Merge #9778 #10161 (#10202) --- .../generic_ut/kqp_generic_provider_ut.cpp | 4 ++++ .../actors/ut/yql_generic_lookup_actor_ut.cpp | 1 + .../generic/actors/yql_generic_lookup_actor.cpp | 1 + .../generic/actors/yql_generic_read_actor.cpp | 1 + .../connector/api/service/protos/connector.proto | 16 ++++++++++++++++ .../libcpp/ut_helpers/connector_client_mock.h | 1 + .../datasource/clickhouse/docker-compose.yml | 2 +- .../datasource/postgresql/docker-compose.yml | 2 +- .../tests/datasource/ydb/docker-compose.yml | 2 +- .../connector/tests/join/docker-compose.yml | 2 +- 10 files changed, 28 insertions(+), 4 deletions(-) diff --git a/ydb/core/kqp/ut/federated_query/generic_ut/kqp_generic_provider_ut.cpp b/ydb/core/kqp/ut/federated_query/generic_ut/kqp_generic_provider_ut.cpp index 175718089600..c461fe68dfac 100644 --- a/ydb/core/kqp/ut/federated_query/generic_ut/kqp_generic_provider_ut.cpp +++ b/ydb/core/kqp/ut/federated_query/generic_ut/kqp_generic_provider_ut.cpp @@ -125,6 +125,7 @@ namespace NKikimr::NKqp { // step 3: ReadSplits std::vector colData = {10, 20, 30, 40, 50}; clientMock->ExpectReadSplits() + .Filtering(NYql::NConnector::NApi::TReadSplitsRequest::FILTERING_OPTIONAL) .Split() .Description("some binary description") .Select() @@ -221,6 +222,7 @@ namespace NKikimr::NKqp { // step 3: ReadSplits clientMock->ExpectReadSplits() + 
.Filtering(NYql::NConnector::NApi::TReadSplitsRequest::FILTERING_OPTIONAL) .Split() .Description("some binary description") .Select() @@ -313,6 +315,7 @@ namespace NKikimr::NKqp { // step 3: ReadSplits clientMock->ExpectReadSplits() + .Filtering(NYql::NConnector::NApi::TReadSplitsRequest::FILTERING_OPTIONAL) .Split() .Description("some binary description") .Select() @@ -418,6 +421,7 @@ namespace NKikimr::NKqp { std::vector filterColumnData = {42, 24}; // clang-format off clientMock->ExpectReadSplits() + .Filtering(NYql::NConnector::NApi::TReadSplitsRequest::FILTERING_OPTIONAL) .Split() .Description("some binary description") .Select(select) diff --git a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp index 48c5b2951098..d7b77b8bbda6 100644 --- a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp +++ b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp @@ -128,6 +128,7 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { connectorMock->ExpectReadSplits() .DataSourceInstance(dsi) + .Filtering(NYql::NConnector::NApi::TReadSplitsRequest::FILTERING_MANDATORY) .Split() .Description("Actual split info is not important") .Done() diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp index 18374715fa32..21090f61e5b8 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp @@ -162,6 +162,7 @@ namespace NYql::NDq { *readRequest.add_splits() = split; readRequest.Setformat(NConnector::NApi::TReadSplitsRequest_EFormat::TReadSplitsRequest_EFormat_ARROW_IPC_STREAMING); + readRequest.set_filtering(NConnector::NApi::TReadSplitsRequest::FILTERING_MANDATORY); Connector->ReadSplits(readRequest).Subscribe([actorSystem = 
TActivationContext::ActorSystem(), selfId = SelfId()](const NConnector::TReadSplitsStreamIteratorAsyncResult& asyncResult) { YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << selfId << " Got ReadSplitsStreamIterator from Connector"; auto result = ExtractFromConstFuture(asyncResult); diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp index 93b3ba116adb..4c664f665826 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp @@ -199,6 +199,7 @@ namespace NYql::NDq { // Prepare request NConnector::NApi::TReadSplitsRequest request; request.set_format(NConnector::NApi::TReadSplitsRequest::ARROW_IPC_STREAMING); + request.set_filtering(NConnector::NApi::TReadSplitsRequest::FILTERING_OPTIONAL); request.mutable_splits()->Reserve(Splits_.size()); for (const auto& split : Splits_) { diff --git a/ydb/library/yql/providers/generic/connector/api/service/protos/connector.proto b/ydb/library/yql/providers/generic/connector/api/service/protos/connector.proto index 7004f2686136..190ca063a8cd 100644 --- a/ydb/library/yql/providers/generic/connector/api/service/protos/connector.proto +++ b/ydb/library/yql/providers/generic/connector/api/service/protos/connector.proto @@ -235,6 +235,22 @@ message TReadSplitsRequest { // If empty, the connector will return the split data from the very beginning. TContinuation continuation = 6; + enum EFiltering { + FILTERING_UNSPECIFIED = 0; + // If Connector cannot push down the predicate to the data source completely + // (due to the lack of data type support, for example), it doesn't apply filter at all + // and returns the full result of `SELECT columns FROM table` (no WHERE clause). + // It's YDB's duty to filter the output on its own side. 
+ FILTERING_OPTIONAL = 1; + // If Connector cannot push down the predicate to the data source completely, + // it terminates the request and returns an error. + FILTERING_MANDATORY = 2; + } + + // Determines various modes of server behavior in the context of predicate pushdown. + // If not set, the default value is `FILTERING_OPTIONAL`. + EFiltering filtering = 7; + reserved 5; } diff --git a/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/connector_client_mock.h b/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/connector_client_mock.h index ae92f1c58631..7f764611593a 100644 --- a/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/connector_client_mock.h +++ b/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/connector_client_mock.h @@ -745,6 +745,7 @@ namespace NYql::NConnector::NTest { DATA_SOURCE_INSTANCE_SUBBUILDER(); SUBPROTO_BUILDER(Split, add_splits, NApi::TSplit, TSplitBuilder); SETTER(Format, format); + SETTER(Filtering, filtering); TReadSplitsResultBuilder Result() { return TReadSplitsResultBuilder(ResponseResult_, this); diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml index c8fb08a8927f..d79b229395e3 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/clickhouse/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-ch-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.11-rc.1@sha256:c018b2f1151fac8a86aaf79950ccdffaa72150785f303096d9a466f50eb62498 ports: - 2130 volumes: diff --git 
a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml index 5069925061a1..d7a98b01993c 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/postgresql/docker-compose.yml @@ -1,7 +1,7 @@ services: fq-connector-go: container_name: fq-tests-pg-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.11-rc.1@sha256:c018b2f1151fac8a86aaf79950ccdffaa72150785f303096d9a466f50eb62498 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml index baf6c9e4ab3e..6282e6653185 100644 --- a/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml +++ b/ydb/library/yql/providers/generic/connector/tests/datasource/ydb/docker-compose.yml @@ -5,7 +5,7 @@ services: echo \"$$(dig fq-tests-ydb-ydb +short) fq-tests-ydb-ydb\" >> /etc/hosts; cat /etc/hosts; /opt/ydb/bin/fq-connector-go server -c /opt/ydb/cfg/fq-connector-go.yaml" container_name: fq-tests-ydb-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.11-rc.1@sha256:c018b2f1151fac8a86aaf79950ccdffaa72150785f303096d9a466f50eb62498 ports: - 2130 volumes: diff --git a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml index daf28b70f920..6d7996d7e149 100644 --- a/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml +++ 
b/ydb/library/yql/providers/generic/connector/tests/join/docker-compose.yml @@ -12,7 +12,7 @@ services: - 8123 fq-connector-go: container_name: fq-tests-join-fq-connector-go - image: ghcr.io/ydb-platform/fq-connector-go:v0.5.7-rc.1@sha256:f12475f346105d7bc630e7b85f51dce980bf9833f6ce0625c6f1191b1a1de923 + image: ghcr.io/ydb-platform/fq-connector-go:v0.5.11-rc.1@sha256:c018b2f1151fac8a86aaf79950ccdffaa72150785f303096d9a466f50eb62498 ports: - 2130 volumes: From eaa40b783024c71298d48873c2aef6c74bb60aef Mon Sep 17 00:00:00 2001 From: Alexey Pozdniakov Date: Wed, 9 Oct 2024 13:08:15 +0300 Subject: [PATCH 39/56] YQ-3617: fix GROUP BY HOP + AS_TABLE (#9370) (#10200) --- .../yql/core/common_opt/yql_co_simple1.cpp | 99 +++ .../yql/core/ut/yql_expr_constraint_ut.cpp | 29 + ydb/library/yql/core/ya.make | 1 + ydb/library/yql/core/yql_expr_constraint.cpp | 23 + ydb/library/yql/core/yql_opt_hopping.cpp | 569 ++++++++++++++++++ ydb/library/yql/core/yql_opt_hopping.h | 64 ++ ydb/library/yql/dq/opt/dq_opt_hopping.cpp | 568 +---------------- .../sql/dq_file/part10/canondata/result.json | 28 + .../sql/dq_file/part18/canondata/result.json | 28 + .../hybrid_file/part2/canondata/result.json | 12 +- .../tests/sql/sql2yql/canondata/result.json | 28 + .../suites/aggregate/group_by_hop_static.sql | 26 + .../group_by_hop_static_list_key.sql | 26 + .../part19/canondata/result.json | 6 +- .../part6/canondata/result.json | 6 +- 15 files changed, 935 insertions(+), 578 deletions(-) create mode 100644 ydb/library/yql/core/yql_opt_hopping.cpp create mode 100644 ydb/library/yql/core/yql_opt_hopping.h create mode 100644 ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static.sql create mode 100644 ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static_list_key.sql diff --git a/ydb/library/yql/core/common_opt/yql_co_simple1.cpp b/ydb/library/yql/core/common_opt/yql_co_simple1.cpp index 20246c67e946..4ac12be226ce 100644 --- a/ydb/library/yql/core/common_opt/yql_co_simple1.cpp +++ 
b/ydb/library/yql/core/common_opt/yql_co_simple1.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -3228,6 +3229,99 @@ TExprNode::TPtr RemoveDeadPayloadColumns(const TCoAggregate& aggr, TExprContext& return aggr.Ptr(); } +TExprNode::TPtr RewriteAsHoppingWindowFullOutput(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + + NHopping::EnsureNotDistinct(aggregate); + + const auto maybeHopTraits = NHopping::ExtractHopTraits(aggregate, ctx, false); + if (!maybeHopTraits) { + return nullptr; + } + const auto hopTraits = *maybeHopTraits; + + const auto aggregateInputType = GetSeqItemType(*aggregate.Ptr()->Head().GetTypeAnn()).Cast(); + NHopping::TKeysDescription keysDescription(*aggregateInputType, aggregate.Keys(), hopTraits.Column); + + const auto keyLambda = keysDescription.GetKeySelector(ctx, pos, aggregateInputType); + const auto timeExtractorLambda = NHopping::BuildTimeExtractor(hopTraits.Traits, ctx); + const auto initLambda = NHopping::BuildInitHopLambda(aggregate, ctx); + const auto updateLambda = NHopping::BuildUpdateHopLambda(aggregate, ctx); + const auto saveLambda = NHopping::BuildSaveHopLambda(aggregate, ctx); + const auto loadLambda = NHopping::BuildLoadHopLambda(aggregate, ctx); + const auto mergeLambda = NHopping::BuildMergeHopLambda(aggregate, ctx); + const auto finishLambda = NHopping::BuildFinishHopLambda(aggregate, keysDescription.GetActualGroupKeys(), hopTraits.Column, ctx); + + const auto streamArg = Build(ctx, pos).Name("stream").Done(); + auto multiHoppingCoreBuilder = Build(ctx, pos) + .KeyExtractor(keyLambda) + .TimeExtractor(timeExtractorLambda) + .Hop(hopTraits.Traits.Hop()) + .Interval(hopTraits.Traits.Interval()) + .Delay(hopTraits.Traits.Delay()) + .DataWatermarks(hopTraits.Traits.DataWatermarks()) + .InitHandler(initLambda) + .UpdateHandler(updateLambda) + .MergeHandler(mergeLambda) + .FinishHandler(finishLambda) + .SaveHandler(saveLambda) + .LoadHandler(loadLambda) 
+ .template WatermarkMode().Build(ToString(false)); + + return Build(ctx, pos) + .Input(aggregate.Input()) + .KeySelectorLambda(keyLambda) + .SortDirections() + .Literal() + .Value("true") + .Build() + .Build() + .SortKeySelectorLambda(timeExtractorLambda) + .ListHandlerLambda() + .Args(streamArg) + .template Body() + .Stream(Build(ctx, pos) + .Input(multiHoppingCoreBuilder + .template Input() + .List(streamArg) + .Build() + .Done()) + .Lambda(keysDescription.BuildUnpickleLambda(ctx, pos, *aggregateInputType)) + .Done()) + .Build() + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr RewriteAsHoppingWindow(TExprNode::TPtr node, TExprContext& ctx) { + const auto aggregate = TCoAggregate(node); + + if (!IsPureIsolatedLambda(*aggregate.Ptr())) { + return nullptr; + } + + if (!GetSetting(aggregate.Settings().Ref(), "hopping")) { + return nullptr; + } + + auto result = RewriteAsHoppingWindowFullOutput(aggregate, ctx); + if (!result) { + return result; + } + + auto outputColumnSetting = GetSetting(aggregate.Settings().Ref(), "output_columns"); + if (!outputColumnSetting) { + return result; + } + + return Build(ctx, aggregate.Pos()) + .Input(result) + .Members(outputColumnSetting->ChildPtr(1)) + .Done() + .Ptr(); +} + TExprNode::TPtr PullAssumeColumnOrderOverEquiJoin(const TExprNode::TPtr& node, TExprContext& ctx, TOptimizeContext& optCtx) { TVector withAssume; for (ui32 i = 0; i < node->ChildrenSize() - 2; i++) { @@ -5007,6 +5101,11 @@ void RegisterCoSimpleCallables1(TCallableOptimizerMap& map) { return clean; } + if (auto hopping = RewriteAsHoppingWindow(node, ctx)) { + YQL_CLOG(DEBUG, Core) << "RewriteAsHoppingWindow"; + return hopping; + } + return DropReorder(node, ctx); }; diff --git a/ydb/library/yql/core/ut/yql_expr_constraint_ut.cpp b/ydb/library/yql/core/ut/yql_expr_constraint_ut.cpp index 2a44fef51a89..df3fdef8e5b3 100644 --- a/ydb/library/yql/core/ut/yql_expr_constraint_ut.cpp +++ b/ydb/library/yql/core/ut/yql_expr_constraint_ut.cpp @@ -3270,6 +3270,35 @@ 
Y_UNIT_TEST_SUITE(TYqlExprConstraints) { CheckConstraint(exprRoot, "LazyList", ""); CheckConstraint(exprRoot, "LazyList", ""); } + + Y_UNIT_TEST(GroupByHop) { + const TStringBuf s = R"(( +(let list (AsList + (AsStruct '('"time" (String '"2024-01-01T00:00:01Z")) '('"user" (Int32 '"1")) '('"data" (Null))) + (AsStruct '('"time" (String '"2024-01-01T00:00:02Z")) '('"user" (Int32 '"1")) '('"data" (Null))) + (AsStruct '('"time" (String '"2024-01-01T00:00:03Z")) '('"user" (Int32 '"1")) '('"data" (Null))) +)) +(let input (FlatMap list (lambda '(row) (Just (AsStruct '('"data" (Member row '"data")) '('group0 (AsList (Member row '"user"))) '('"time" (Member row '"time")) '('"user" (Member row '"user"))))))) +(let keySelector (lambda '(row) '((StablePickle (Member row '"data")) (StablePickle (Member row 'group0))))) +(let sortKeySelector (lambda '(row) (SafeCast (Member row '"time") (OptionalType (DataType 'Timestamp))))) +(let res (PartitionsByKeys input keySelector (Bool 'true) sortKeySelector (lambda '(row) (block '( + (let interval (Interval '1000000)) + (let map (lambda '(item) (AsStruct))) + (let reduce (lambda '(lhs rhs) (AsStruct))) + (let hopping (MultiHoppingCore (Iterator row) keySelector sortKeySelector interval interval interval 'true map reduce map map reduce (lambda '(key state time) (AsStruct '('_yql_time time) '('"data" (Nth key '"0")) '('group0 (Nth key '"1")))) '"0")) + (return (ForwardList (FlatMap hopping (lambda '(row) (Just (AsStruct '('_yql_time (Member row '_yql_time)) '('"data" (Unpickle (NullType) (Member row '"data"))) '('group0 (Unpickle (ListType (DataType 'Int32)) (Member row 'group0))))))))) +))))) + +(let res_sink (DataSink 'yt (quote plato))) +(let world (Write! world res_sink (Key '('table (String 'Output))) res '('('mode 'renew)))) +(return (Commit! 
world res_sink)) + ))"; + + TExprContext exprCtx; + const auto exprRoot = ParseAndAnnotate(s, exprCtx); + CheckConstraint(exprRoot, "PartitionsByKeys", "Distinct((data,group0))"); + CheckConstraint(exprRoot, "PartitionsByKeys", "Unique((data,group0))"); + } } } // namespace NYql diff --git a/ydb/library/yql/core/ya.make b/ydb/library/yql/core/ya.make index 02f4eb87ffaa..17edc9ef2822 100644 --- a/ydb/library/yql/core/ya.make +++ b/ydb/library/yql/core/ya.make @@ -28,6 +28,7 @@ SRCS( yql_join.cpp yql_join.h yql_library_compiler.cpp + yql_opt_hopping.cpp yql_opt_match_recognize.cpp yql_opt_match_recognize.h yql_opt_proposed_by_data.cpp diff --git a/ydb/library/yql/core/yql_expr_constraint.cpp b/ydb/library/yql/core/yql_expr_constraint.cpp index 604983d1d24a..feafb3bb6082 100644 --- a/ydb/library/yql/core/yql_expr_constraint.cpp +++ b/ydb/library/yql/core/yql_expr_constraint.cpp @@ -241,6 +241,9 @@ class TCallableConstraintTransformer : public TCallableTransformerBase; Functions["BlockMergeFinalizeHashed"] = &TCallableConstraintTransformer::AggregateWrap; Functions["BlockMergeManyFinalizeHashed"] = &TCallableConstraintTransformer::AggregateWrap; + Functions["MultiHoppingCore"] = &TCallableConstraintTransformer::MultiHoppingCoreWrap; + Functions["StablePickle"] = &TCallableConstraintTransformer::FromFirst; + Functions["Unpickle"] = &TCallableConstraintTransformer::FromSecond; } std::optional ProcessCore(const TExprNode::TPtr& input, TExprNode::TPtr& output, TExprContext& ctx) { @@ -2892,6 +2895,26 @@ class TCallableConstraintTransformer : public TCallableTransformerBaseChild(TCoMultiHoppingCore::idx_KeyExtractor); + const auto keys = GetPathsToKeys(keySelectorLambda->Tail(), keySelectorLambda->Head().Head()); + std::vector columns(keys.size()); + std::transform(keys.begin(), keys.end(), columns.begin(), [](const TPartOfConstraintBase::TPathType& path) -> std::string_view { + return path.front(); + }); + if (!columns.empty()) { + 
input->AddConstraint(ctx.MakeConstraint(columns)); + input->AddConstraint(ctx.MakeConstraint(columns)); + } + + return TStatus::Ok; + } + private: template static void CopyExcept(TConstraintContainer& dst, const TConstraintContainer& from, const TSet& except) { diff --git a/ydb/library/yql/core/yql_opt_hopping.cpp b/ydb/library/yql/core/yql_opt_hopping.cpp new file mode 100644 index 000000000000..2cfce04cfadc --- /dev/null +++ b/ydb/library/yql/core/yql_opt_hopping.cpp @@ -0,0 +1,569 @@ +#include "yql_opt_hopping.h" + +#include + +#include + +#include +#include +#include +#include + +using namespace NYql; +using namespace NYql::NNodes; + +namespace NYql::NHopping { + +TKeysDescription::TKeysDescription(const TStructExprType& rowType, const TCoAtomList& keys, const TString& hoppingColumn) { + for (const auto& key : keys) { + if (key.StringValue() == hoppingColumn) { + FakeKeys.emplace_back(key.StringValue()); + continue; + } + + const auto index = rowType.FindItem(key.StringValue()); + Y_ENSURE(index); + + auto itemType = rowType.GetItems()[*index]->GetItemType(); + if (RemoveOptionalType(itemType)->GetKind() == ETypeAnnotationKind::Data) { + MemberKeys.emplace_back(key.StringValue()); + continue; + } + + PickleKeys.emplace_back(key.StringValue()); + } +} + +TExprNode::TPtr TKeysDescription::BuildPickleLambda(TExprContext& ctx, TPositionHandle pos) const { + TCoArgument arg = Build(ctx, pos) + .Name("item") + .Done(); + + TExprBase body = arg; + + for (const auto& key : PickleKeys) { + const auto member = Build(ctx, pos) + .Name().Build(key) + .Struct(arg) + .Done() + .Ptr(); + + body = Build(ctx, pos) + .Struct(body) + .Name().Build(key) + .Item(ctx.NewCallable(pos, "StablePickle", { member })) + .Done(); + } + + return Build(ctx, pos) + .Args({arg}) + .Body(body) + .Done() + .Ptr(); +} + +TExprNode::TPtr TKeysDescription::BuildUnpickleLambda(TExprContext& ctx, TPositionHandle pos, const TStructExprType& rowType) { + TCoArgument arg = Build(ctx, pos) + 
.Name("item") + .Done(); + + TExprBase body = arg; + + for (const auto& key : PickleKeys) { + const auto index = rowType.FindItem(key); + Y_ENSURE(index); + + auto itemType = rowType.GetItems().at(*index)->GetItemType(); + const auto member = Build(ctx, pos) + .Name().Build(key) + .Struct(arg) + .Done() + .Ptr(); + + body = Build(ctx, pos) + .Struct(body) + .Name().Build(key) + .Item(ctx.NewCallable(pos, "Unpickle", { ExpandType(pos, *itemType, ctx), member })) + .Done(); + } + + return Build(ctx, pos) + .Args({arg}) + .Body(body) + .Done() + .Ptr(); +} + +TVector TKeysDescription::GetKeysList(TExprContext& ctx, TPositionHandle pos) const { + TVector res; + res.reserve(PickleKeys.size() + MemberKeys.size()); + + for (const auto& pickleKey : PickleKeys) { + res.emplace_back(Build(ctx, pos).Value(pickleKey).Done()); + } + for (const auto& memberKey : MemberKeys) { + res.emplace_back(Build(ctx, pos).Value(memberKey).Done()); + } + return res; +} + +TVector TKeysDescription::GetActualGroupKeys() const { + TVector result; + result.reserve(PickleKeys.size() + MemberKeys.size()); + result.insert(result.end(), PickleKeys.begin(), PickleKeys.end()); + result.insert(result.end(), MemberKeys.begin(), MemberKeys.end()); + return result; +} + +bool TKeysDescription::NeedPickle() const { + return !PickleKeys.empty(); +} + +TExprNode::TPtr TKeysDescription::GetKeySelector(TExprContext& ctx, TPositionHandle pos, const TStructExprType* rowType) { + auto builder = Build(ctx, pos); + for (auto key : GetKeysList(ctx, pos)) { + builder.Add(std::move(key)); + } + return BuildKeySelector(pos, *rowType, builder.Build().Value().Ptr(), ctx); +} + +TString BuildColumnName(const TExprBase& column) { + if (const auto columnName = column.Maybe()) { + return columnName.Cast().StringValue(); + } + + if (const auto columnNames = column.Maybe()) { + TStringBuilder columnNameBuilder; + for (const auto columnName : columnNames.Cast()) { + columnNameBuilder.append(columnName.StringValue()); + 
columnNameBuilder.append("_"); + } + return columnNameBuilder; + } + + YQL_ENSURE(false, "Invalid node. Expected Atom or AtomList, but received: " + << column.Ptr()->Dump()); +} + +bool IsLegacyHopping(const TExprNode::TPtr& hoppingSetting) { + return !hoppingSetting->Child(1)->IsList(); +} + +void EnsureNotDistinct(const TCoAggregate& aggregate) { + const auto& aggregateHandlers = aggregate.Handlers(); + + YQL_ENSURE( + AllOf(aggregateHandlers, [](const auto& t){ return !t.DistinctName(); }), + "Distinct is not supported for aggregation with hop"); +} + +TMaybe ExtractHopTraits(const TCoAggregate& aggregate, TExprContext& ctx, bool analyticsMode) { + const auto pos = aggregate.Pos(); + + const auto hopSetting = GetSetting(aggregate.Settings().Ref(), "hopping"); + if (!hopSetting) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Aggregate over stream must have 'hopping' setting")); + return Nothing(); + } + + const auto hoppingColumn = IsLegacyHopping(hopSetting) + ? "_yql_time" + : TString(hopSetting->Child(1)->Child(0)->Content()); + + const auto traitsNode = IsLegacyHopping(hopSetting) + ? 
hopSetting->Child(1) + : hopSetting->Child(1)->Child(1); + + const auto maybeTraits = TMaybeNode(traitsNode); + if (!maybeTraits) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Invalid 'hopping' setting in Aggregate")); + return Nothing(); + } + + const auto traits = maybeTraits.Cast(); + + const auto checkIntervalParam = [&] (TExprBase param) -> ui64 { + if (param.Maybe()) { + param = param.Cast().Input(); + } + if (!param.Maybe()) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Not an interval data ctor")); + return 0; + } + auto value = FromString(param.Cast().Literal().Value()); + if (value <= 0) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Interval value must be positive")); + return 0; + } + return (ui64)value; + }; + + const auto hop = checkIntervalParam(traits.Hop()); + if (!hop) { + return Nothing(); + } + const auto interval = checkIntervalParam(traits.Interval()); + if (!interval) { + return Nothing(); + } + const auto delay = checkIntervalParam(traits.Delay()); + if (!delay) { + return Nothing(); + } + + if (interval < hop) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Interval must be greater or equal then hop")); + return Nothing(); + } + if (delay < hop) { + ctx.AddError(TIssue(ctx.GetPosition(pos), "Delay must be greater or equal then hop")); + return Nothing(); + } + + const auto newTraits = Build(ctx, aggregate.Pos()) + .InitFrom(traits) + .DataWatermarks(analyticsMode + ? ctx.NewAtom(aggregate.Pos(), "false") + : traits.DataWatermarks().Ptr()) + .Done(); + + return THoppingTraits { + hoppingColumn, + newTraits, + hop, + interval, + delay + }; +} + +TExprNode::TPtr BuildTimeExtractor(const TCoHoppingTraits& hoppingTraits, TExprContext& ctx) { + const auto pos = hoppingTraits.Pos(); + + if (hoppingTraits.ItemType().Ref().GetTypeAnn()->Cast()->GetType()->Cast()->GetSize() == 0) { + // The case when no fields are used in lambda. F.e. when it has only DependsOn. 
+ return ctx.DeepCopyLambda(hoppingTraits.TimeExtractor().Ref()); + } + + return Build(ctx, pos) + .Args({"item"}) + .Body() + .Apply(hoppingTraits.TimeExtractor()) + .With(0) + .Type(hoppingTraits.ItemType()) + .Value("item") + .Build() + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildInitHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + const auto& aggregateHandlers = aggregate.Handlers(); + + const auto initItemArg = Build(ctx, pos).Name("item").Done(); + + TVector structItems; + structItems.reserve(aggregateHandlers.Size()); + + ui32 index = 0; + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + + TMaybeNode applier; + if (tuple.Trait().Cast().InitHandler().Args().Size() == 1) { + applier = Build(ctx, pos) + .Apply(tuple.Trait().Cast().InitHandler()) + .With(0, initItemArg) + .Done(); + } else { + applier = Build(ctx, pos) + .Apply(tuple.Trait().Cast().InitHandler()) + .With(0, initItemArg) + .With(1) + .Literal().Build(ToString(index)) + .Build() + .Done(); + } + + structItems.push_back(Build(ctx, pos) + .Name().Build(BuildColumnName(tuple.ColumnName())) + .Value(applier) + .Done()); + ++index; + } + + return Build(ctx, pos) + .Args({initItemArg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildUpdateHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + const auto aggregateHandlers = aggregate.Handlers(); + + const auto updateItemArg = Build(ctx, pos).Name("item").Done(); + const auto updateStateArg = Build(ctx, pos).Name("state").Done(); + + TVector structItems; + structItems.reserve(aggregateHandlers.Size()); + + i32 index = 0; + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + const TString columnName = BuildColumnName(tuple.ColumnName()); + + const auto member = Build(ctx, pos) + .Struct(updateStateArg) + .Name().Build(columnName) + 
.Done(); + + TMaybeNode applier; + if (tuple.Trait().Cast().UpdateHandler().Args().Size() == 2) { + applier = Build(ctx, pos) + .Apply(tuple.Trait().Cast().UpdateHandler()) + .With(0, updateItemArg) + .With(1, member) + .Done(); + } else { + applier = Build(ctx, pos) + .Apply(tuple.Trait().Cast().UpdateHandler()) + .With(0, updateItemArg) + .With(1, member) + .With(2) + .Literal().Build(ToString(index)) + .Build() + .Done(); + } + + structItems.push_back(Build(ctx, pos) + .Name().Build(columnName) + .Value(applier) + .Done()); + ++index; + } + + return Build(ctx, pos) + .Args({updateItemArg, updateStateArg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildMergeHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + const auto& aggregateHandlers = aggregate.Handlers(); + + const auto mergeState1Arg = Build(ctx, pos).Name("state1").Done(); + const auto mergeState2Arg = Build(ctx, pos).Name("state2").Done(); + + TVector structItems; + structItems.reserve(aggregateHandlers.Size()); + + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + const TString columnName = BuildColumnName(tuple.ColumnName()); + + const auto member1 = Build(ctx, pos) + .Struct(mergeState1Arg) + .Name().Build(columnName) + .Done(); + const auto member2 = Build(ctx, pos) + .Struct(mergeState2Arg) + .Name().Build(columnName) + .Done(); + + structItems.push_back(Build(ctx, pos) + .Name().Build(columnName) + .Value() + .Apply(tuple.Trait().Cast().MergeHandler()) + .With(0, member1) + .With(1, member2) + .Build() + .Done()); + } + + return Build(ctx, pos) + .Args({mergeState1Arg, mergeState2Arg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildFinishHopLambda( + const TCoAggregate& aggregate, + const TVector& actualGroupKeys, + const TString& hoppingColumn, + TExprContext& ctx) +{ + const auto pos = aggregate.Pos(); + const auto aggregateHandlers 
= aggregate.Handlers(); + + const auto finishKeyArg = Build(ctx, pos).Name("key").Done(); + const auto finishStateArg = Build(ctx, pos).Name("state").Done(); + const auto finishTimeArg = Build(ctx, pos).Name("time").Done(); + + TVector structItems; + structItems.reserve(actualGroupKeys.size() + aggregateHandlers.Size() + 1); + + if (actualGroupKeys.size() == 1) { + structItems.push_back(Build(ctx, pos) + .Name().Build(actualGroupKeys[0]) + .Value(finishKeyArg) + .Done()); + } else { + for (size_t i = 0; i < actualGroupKeys.size(); ++i) { + structItems.push_back(Build(ctx, pos) + .Name().Build(actualGroupKeys[i]) + .Value() + .Tuple(finishKeyArg) + .Index() + .Value(ToString(i)) + .Build() + .Build() + .Done()); + } + } + + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + const TString compoundColumnName = BuildColumnName(tuple.ColumnName()); + + const auto member = Build(ctx, pos) + .Struct(finishStateArg) + .Name().Build(compoundColumnName) + .Done(); + + if (tuple.ColumnName().Maybe()) { + structItems.push_back(Build(ctx, pos) + .Name().Build(compoundColumnName) + .Value() + .Apply(tuple.Trait().Cast().FinishHandler()) + .With(0, member) + .Build() + .Done()); + + continue; + } + + if (const auto namesList = tuple.ColumnName().Maybe()) { + const auto expApplier = Build(ctx, pos) + .Apply(tuple.Trait().Cast().FinishHandler()) + .With(0, member) + .Done(); + + int index = 0; + for (const auto columnName : namesList.Cast()) { + const auto extracter = Build(ctx, pos) + .Tuple(expApplier) + .Index().Build(index++) + .Done(); + + structItems.push_back(Build(ctx, pos) + .Name(columnName) + .Value(extracter) + .Done()); + } + + continue; + } + + YQL_ENSURE(false, "Invalid node. 
Expected Atom or AtomList, but received: " + << tuple.ColumnName().Ptr()->Dump()); + } + + structItems.push_back(Build(ctx, pos) + .Name().Build(hoppingColumn) + .Value(finishTimeArg) + .Done()); + + return Build(ctx, pos) + .Args({finishKeyArg, finishStateArg, finishTimeArg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildSaveHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + const auto aggregateHandlers = aggregate.Handlers(); + + const auto saveStateArg = Build(ctx, pos).Name("state").Done(); + + TVector structItems; + structItems.reserve(aggregateHandlers.Size()); + + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + const TString columnName = BuildColumnName(tuple.ColumnName()); + + const auto member = Build(ctx, pos) + .Struct(saveStateArg) + .Name().Build(columnName) + .Done(); + + structItems.push_back(Build(ctx, pos) + .Name().Build(columnName) + .Value() + .Apply(tuple.Trait().Cast().SaveHandler()) + .With(0, member) + .Build() + .Done()); + } + + return Build(ctx, pos) + .Args({saveStateArg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +TExprNode::TPtr BuildLoadHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { + const auto pos = aggregate.Pos(); + const auto aggregateHandlers = aggregate.Handlers(); + + TCoArgument loadStateArg = Build(ctx, pos).Name("state").Done(); + + TVector structItems; + structItems.reserve(aggregateHandlers.Size()); + + for (const auto& handler : aggregateHandlers) { + const auto tuple = handler.Cast(); + const TString columnName = BuildColumnName(tuple.ColumnName()); + + const auto member = Build(ctx, pos) + .Struct(loadStateArg) + .Name().Build(columnName) + .Done(); + + structItems.push_back(Build(ctx, pos) + .Name().Build(columnName) + .Value() + .Apply(tuple.Trait().Cast().LoadHandler()) + .With(0, member) + .Build() + .Done()); + } + + return Build(ctx, pos) + 
.Args({loadStateArg}) + .Body() + .Add(structItems) + .Build() + .Done() + .Ptr(); +} + +} // NYql::NHopping diff --git a/ydb/library/yql/core/yql_opt_hopping.h b/ydb/library/yql/core/yql_opt_hopping.h new file mode 100644 index 000000000000..a9c2f458bb08 --- /dev/null +++ b/ydb/library/yql/core/yql_opt_hopping.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include + +#include + +namespace NYql::NHopping { + +struct THoppingTraits { + TString Column; + NYql::NNodes::TCoHoppingTraits Traits; + ui64 Hop; + ui64 Interval; + ui64 Delay; +}; + +struct TKeysDescription { + TVector PickleKeys; + TVector MemberKeys; + TVector FakeKeys; + + TKeysDescription(const TStructExprType& rowType, const NYql::NNodes::TCoAtomList& keys, const TString& hoppingColumn); + + TExprNode::TPtr BuildPickleLambda(TExprContext& ctx, TPositionHandle pos) const; + + TExprNode::TPtr BuildUnpickleLambda(TExprContext& ctx, TPositionHandle pos, const TStructExprType& rowType); + + TVector GetKeysList(TExprContext& ctx, TPositionHandle pos) const; + + TVector GetActualGroupKeys() const; + + bool NeedPickle() const; + + TExprNode::TPtr GetKeySelector(TExprContext& ctx, TPositionHandle pos, const TStructExprType* rowType); +}; + +TString BuildColumnName(const NYql::NNodes::TExprBase& column); + +bool IsLegacyHopping(const TExprNode::TPtr& hoppingSetting); + +void EnsureNotDistinct(const NYql::NNodes::TCoAggregate& aggregate); + +TMaybe ExtractHopTraits(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx, bool analyticsMode); + +TExprNode::TPtr BuildTimeExtractor(const NYql::NNodes::TCoHoppingTraits& hoppingTraits, TExprContext& ctx); + +TExprNode::TPtr BuildInitHopLambda(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx); + +TExprNode::TPtr BuildUpdateHopLambda(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx); + +TExprNode::TPtr BuildMergeHopLambda(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx); + +TExprNode::TPtr BuildFinishHopLambda( + const 
NYql::NNodes::TCoAggregate& aggregate, + const TVector& actualGroupKeys, + const TString& hoppingColumn, + TExprContext& ctx); + +TExprNode::TPtr BuildSaveHopLambda(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx); + +TExprNode::TPtr BuildLoadHopLambda(const NYql::NNodes::TCoAggregate& aggregate, TExprContext& ctx); + +} // namespace NYql::NHopping diff --git a/ydb/library/yql/dq/opt/dq_opt_hopping.cpp b/ydb/library/yql/dq/opt/dq_opt_hopping.cpp index 661fc2b6d724..daae3444ad6d 100644 --- a/ydb/library/yql/dq/opt/dq_opt_hopping.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_hopping.cpp @@ -1,6 +1,7 @@ #include "dq_opt_hopping.h" #include +#include #include #include @@ -19,366 +20,11 @@ using namespace NYql; using namespace NYql::NDq; +using namespace NYql::NHopping; using namespace NYql::NNodes; namespace { -struct THoppingTraits { - TString Column; - TCoHoppingTraits Traits; - ui64 Hop; - ui64 Interval; - ui64 Delay; -}; - - struct TKeysDescription { - TVector PickleKeys; - TVector MemberKeys; - TVector FakeKeys; - - TKeysDescription(const TStructExprType& rowType, const TCoAtomList& keys, const TString& hoppingColumn) { - for (const auto& key : keys) { - if (key.StringValue() == hoppingColumn) { - FakeKeys.emplace_back(key.StringValue()); - continue; - } - - const auto index = rowType.FindItem(key.StringValue()); - Y_ENSURE(index); - - auto itemType = rowType.GetItems()[*index]->GetItemType(); - if (RemoveOptionalType(itemType)->GetKind() == ETypeAnnotationKind::Data) { - MemberKeys.emplace_back(key.StringValue()); - continue; - } - - PickleKeys.emplace_back(key.StringValue()); - } - } - - TExprNode::TPtr BuildPickleLambda(TExprContext& ctx, TPositionHandle pos) const { - TCoArgument arg = Build(ctx, pos) - .Name("item") - .Done(); - - TExprBase body = arg; - - for (const auto& key : PickleKeys) { - const auto member = Build(ctx, pos) - .Name().Build(key) - .Struct(arg) - .Done() - .Ptr(); - - body = Build(ctx, pos) - .Struct(body) - .Name().Build(key) - 
.Item(ctx.NewCallable(pos, "StablePickle", { member })) - .Done(); - } - - return Build(ctx, pos) - .Args({arg}) - .Body(body) - .Done() - .Ptr(); - } - - TExprNode::TPtr BuildUnpickleLambda(TExprContext& ctx, TPositionHandle pos, const TStructExprType& rowType) { - TCoArgument arg = Build(ctx, pos) - .Name("item") - .Done(); - - TExprBase body = arg; - - for (const auto& key : PickleKeys) { - const auto index = rowType.FindItem(key); - Y_ENSURE(index); - - auto itemType = rowType.GetItems().at(*index)->GetItemType(); - const auto member = Build(ctx, pos) - .Name().Build(key) - .Struct(arg) - .Done() - .Ptr(); - - body = Build(ctx, pos) - .Struct(body) - .Name().Build(key) - .Item(ctx.NewCallable(pos, "Unpickle", { ExpandType(pos, *itemType, ctx), member })) - .Done(); - } - - return Build(ctx, pos) - .Args({arg}) - .Body(body) - .Done() - .Ptr(); - } - - TVector GetKeysList(TExprContext& ctx, TPositionHandle pos) const { - TVector res; - res.reserve(PickleKeys.size() + MemberKeys.size()); - - for (const auto& pickleKey : PickleKeys) { - res.emplace_back(Build(ctx, pos).Value(pickleKey).Done()); - } - for (const auto& memberKey : MemberKeys) { - res.emplace_back(Build(ctx, pos).Value(memberKey).Done()); - } - return res; - } - - TVector GetActualGroupKeys() { - TVector result; - result.reserve(PickleKeys.size() + MemberKeys.size()); - result.insert(result.end(), PickleKeys.begin(), PickleKeys.end()); - result.insert(result.end(), MemberKeys.begin(), MemberKeys.end()); - return result; - } - - bool NeedPickle() const { - return !PickleKeys.empty(); - } - - TExprNode::TPtr GetKeySelector(TExprContext& ctx, TPositionHandle pos, const TStructExprType* rowType) { - auto builder = Build(ctx, pos); - for (auto key : GetKeysList(ctx, pos)) { - builder.Add(std::move(key)); - } - return BuildKeySelector(pos, *rowType, builder.Build().Value().Ptr(), ctx); - } -}; - -TString BuildColumnName(const TExprBase& column) { - if (const auto columnName = column.Maybe()) { - return 
columnName.Cast().StringValue(); - } - - if (const auto columnNames = column.Maybe()) { - TStringBuilder columnNameBuilder; - for (const auto columnName : columnNames.Cast()) { - columnNameBuilder.append(columnName.StringValue()); - columnNameBuilder.append("_"); - } - return columnNameBuilder; - } - - YQL_ENSURE(false, "Invalid node. Expected Atom or AtomList, but received: " - << column.Ptr()->Dump()); -} - -bool IsLegacyHopping(const TExprNode::TPtr& hoppingSetting) { - return !hoppingSetting->Child(1)->IsList(); -} - -void EnsureNotDistinct(const TCoAggregate& aggregate) { - const auto& aggregateHandlers = aggregate.Handlers(); - - YQL_ENSURE( - AllOf(aggregateHandlers, [](const auto& t){ return !t.DistinctName(); }), - "Distinct is not supported for aggregation with hop"); -} - -TMaybe ExtractHopTraits(const TCoAggregate& aggregate, TExprContext& ctx, bool analyticsMode) { - const auto pos = aggregate.Pos(); - - const auto hopSetting = GetSetting(aggregate.Settings().Ref(), "hopping"); - if (!hopSetting) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Aggregate over stream must have 'hopping' setting")); - return Nothing(); - } - - const auto hoppingColumn = IsLegacyHopping(hopSetting) - ? "_yql_time" - : TString(hopSetting->Child(1)->Child(0)->Content()); - - const auto traitsNode = IsLegacyHopping(hopSetting) - ? 
hopSetting->Child(1) - : hopSetting->Child(1)->Child(1); - - const auto maybeTraits = TMaybeNode(traitsNode); - if (!maybeTraits) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Invalid 'hopping' setting in Aggregate")); - return Nothing(); - } - - const auto traits = maybeTraits.Cast(); - - const auto checkIntervalParam = [&] (TExprBase param) -> ui64 { - if (param.Maybe()) { - param = param.Cast().Input(); - } - if (!param.Maybe()) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Not an interval data ctor")); - return 0; - } - auto value = FromString(param.Cast().Literal().Value()); - if (value <= 0) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Interval value must be positive")); - return 0; - } - return (ui64)value; - }; - - const auto hop = checkIntervalParam(traits.Hop()); - if (!hop) { - return Nothing(); - } - const auto interval = checkIntervalParam(traits.Interval()); - if (!interval) { - return Nothing(); - } - const auto delay = checkIntervalParam(traits.Delay()); - if (!delay) { - return Nothing(); - } - - if (interval < hop) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Interval must be greater or equal then hop")); - return Nothing(); - } - if (delay < hop) { - ctx.AddError(TIssue(ctx.GetPosition(pos), "Delay must be greater or equal then hop")); - return Nothing(); - } - - const auto newTraits = Build(ctx, aggregate.Pos()) - .InitFrom(traits) - .DataWatermarks(analyticsMode - ? ctx.NewAtom(aggregate.Pos(), "false") - : traits.DataWatermarks().Ptr()) - .Done(); - - return THoppingTraits { - hoppingColumn, - newTraits, - hop, - interval, - delay - }; -} - -TExprNode::TPtr BuildTimeExtractor(const TCoHoppingTraits& hoppingTraits, TExprContext& ctx) { - const auto pos = hoppingTraits.Pos(); - - if (hoppingTraits.ItemType().Ref().GetTypeAnn()->Cast()->GetType()->Cast()->GetSize() == 0) { - // The case when no fields are used in lambda. F.e. when it has only DependsOn. 
- return ctx.DeepCopyLambda(hoppingTraits.TimeExtractor().Ref()); - } - - return Build(ctx, pos) - .Args({"item"}) - .Body() - .Apply(hoppingTraits.TimeExtractor()) - .With(0) - .Type(hoppingTraits.ItemType()) - .Value("item") - .Build() - .Build() - .Done() - .Ptr(); -} - -TExprNode::TPtr BuildInitHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { - const auto pos = aggregate.Pos(); - const auto& aggregateHandlers = aggregate.Handlers(); - - const auto initItemArg = Build(ctx, pos).Name("item").Done(); - - TVector structItems; - structItems.reserve(aggregateHandlers.Size()); - - ui32 index = 0; - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - - TMaybeNode applier; - if (tuple.Trait().Cast().InitHandler().Args().Size() == 1) { - applier = Build(ctx, pos) - .Apply(tuple.Trait().Cast().InitHandler()) - .With(0, initItemArg) - .Done(); - } else { - applier = Build(ctx, pos) - .Apply(tuple.Trait().Cast().InitHandler()) - .With(0, initItemArg) - .With(1) - .Literal().Build(ToString(index)) - .Build() - .Done(); - } - - structItems.push_back(Build(ctx, pos) - .Name().Build(BuildColumnName(tuple.ColumnName())) - .Value(applier) - .Done()); - ++index; - } - - return Build(ctx, pos) - .Args({initItemArg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - -TExprNode::TPtr BuildUpdateHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { - const auto pos = aggregate.Pos(); - const auto aggregateHandlers = aggregate.Handlers(); - - const auto updateItemArg = Build(ctx, pos).Name("item").Done(); - const auto updateStateArg = Build(ctx, pos).Name("state").Done(); - - TVector structItems; - structItems.reserve(aggregateHandlers.Size()); - - i32 index = 0; - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - const TString columnName = BuildColumnName(tuple.ColumnName()); - - const auto member = Build(ctx, pos) - .Struct(updateStateArg) - .Name().Build(columnName) - 
.Done(); - - TMaybeNode applier; - if (tuple.Trait().Cast().UpdateHandler().Args().Size() == 2) { - applier = Build(ctx, pos) - .Apply(tuple.Trait().Cast().UpdateHandler()) - .With(0, updateItemArg) - .With(1, member) - .Done(); - } else { - applier = Build(ctx, pos) - .Apply(tuple.Trait().Cast().UpdateHandler()) - .With(0, updateItemArg) - .With(1, member) - .With(2) - .Literal().Build(ToString(index)) - .Build() - .Done(); - } - - structItems.push_back(Build(ctx, pos) - .Name().Build(columnName) - .Value(applier) - .Done()); - ++index; - } - - return Build(ctx, pos) - .Args({updateItemArg, updateStateArg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - TExprNode::TPtr WrapToShuffle( const TKeysDescription& keysDescription, const TCoAggregate& aggregate, @@ -421,216 +67,6 @@ TExprNode::TPtr WrapToShuffle( .Ptr(); } -TExprNode::TPtr BuildMergeHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { - const auto pos = aggregate.Pos(); - const auto& aggregateHandlers = aggregate.Handlers(); - - const auto mergeState1Arg = Build(ctx, pos).Name("state1").Done(); - const auto mergeState2Arg = Build(ctx, pos).Name("state2").Done(); - - TVector structItems; - structItems.reserve(aggregateHandlers.Size()); - - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - const TString columnName = BuildColumnName(tuple.ColumnName()); - - const auto member1 = Build(ctx, pos) - .Struct(mergeState1Arg) - .Name().Build(columnName) - .Done(); - const auto member2 = Build(ctx, pos) - .Struct(mergeState2Arg) - .Name().Build(columnName) - .Done(); - - structItems.push_back(Build(ctx, pos) - .Name().Build(columnName) - .Value() - .Apply(tuple.Trait().Cast().MergeHandler()) - .With(0, member1) - .With(1, member2) - .Build() - .Done()); - } - - return Build(ctx, pos) - .Args({mergeState1Arg, mergeState2Arg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - -TExprNode::TPtr BuildFinishHopLambda( - const TCoAggregate& 
aggregate, - const TVector& actualGroupKeys, - const TString& hoppingColumn, - TExprContext& ctx) -{ - const auto pos = aggregate.Pos(); - const auto aggregateHandlers = aggregate.Handlers(); - - const auto finishKeyArg = Build(ctx, pos).Name("key").Done(); - const auto finishStateArg = Build(ctx, pos).Name("state").Done(); - const auto finishTimeArg = Build(ctx, pos).Name("time").Done(); - - TVector structItems; - structItems.reserve(actualGroupKeys.size() + aggregateHandlers.Size() + 1); - - if (actualGroupKeys.size() == 1) { - structItems.push_back(Build(ctx, pos) - .Name().Build(actualGroupKeys[0]) - .Value(finishKeyArg) - .Done()); - } else { - for (size_t i = 0; i < actualGroupKeys.size(); ++i) { - structItems.push_back(Build(ctx, pos) - .Name().Build(actualGroupKeys[i]) - .Value() - .Tuple(finishKeyArg) - .Index() - .Value(ToString(i)) - .Build() - .Build() - .Done()); - } - } - - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - const TString compoundColumnName = BuildColumnName(tuple.ColumnName()); - - const auto member = Build(ctx, pos) - .Struct(finishStateArg) - .Name().Build(compoundColumnName) - .Done(); - - if (tuple.ColumnName().Maybe()) { - structItems.push_back(Build(ctx, pos) - .Name().Build(compoundColumnName) - .Value() - .Apply(tuple.Trait().Cast().FinishHandler()) - .With(0, member) - .Build() - .Done()); - - continue; - } - - if (const auto namesList = tuple.ColumnName().Maybe()) { - const auto expApplier = Build(ctx, pos) - .Apply(tuple.Trait().Cast().FinishHandler()) - .With(0, member) - .Done(); - - int index = 0; - for (const auto columnName : namesList.Cast()) { - const auto extracter = Build(ctx, pos) - .Tuple(expApplier) - .Index().Build(index++) - .Done(); - - structItems.push_back(Build(ctx, pos) - .Name(columnName) - .Value(extracter) - .Done()); - } - - continue; - } - - YQL_ENSURE(false, "Invalid node. 
Expected Atom or AtomList, but received: " - << tuple.ColumnName().Ptr()->Dump()); - } - - structItems.push_back(Build(ctx, pos) - .Name().Build(hoppingColumn) - .Value(finishTimeArg) - .Done()); - - return Build(ctx, pos) - .Args({finishKeyArg, finishStateArg, finishTimeArg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - -TExprNode::TPtr BuildSaveHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { - const auto pos = aggregate.Pos(); - const auto aggregateHandlers = aggregate.Handlers(); - - const auto saveStateArg = Build(ctx, pos).Name("state").Done(); - - TVector structItems; - structItems.reserve(aggregateHandlers.Size()); - - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - const TString columnName = BuildColumnName(tuple.ColumnName()); - - const auto member = Build(ctx, pos) - .Struct(saveStateArg) - .Name().Build(columnName) - .Done(); - - structItems.push_back(Build(ctx, pos) - .Name().Build(columnName) - .Value() - .Apply(tuple.Trait().Cast().SaveHandler()) - .With(0, member) - .Build() - .Done()); - } - - return Build(ctx, pos) - .Args({saveStateArg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - -TExprNode::TPtr BuildLoadHopLambda(const TCoAggregate& aggregate, TExprContext& ctx) { - const auto pos = aggregate.Pos(); - const auto aggregateHandlers = aggregate.Handlers(); - - TCoArgument loadStateArg = Build(ctx, pos).Name("state").Done(); - - TVector structItems; - structItems.reserve(aggregateHandlers.Size()); - - for (const auto& handler : aggregateHandlers) { - const auto tuple = handler.Cast(); - const TString columnName = BuildColumnName(tuple.ColumnName()); - - const auto member = Build(ctx, pos) - .Struct(loadStateArg) - .Name().Build(columnName) - .Done(); - - structItems.push_back(Build(ctx, pos) - .Name().Build(columnName) - .Value() - .Apply(tuple.Trait().Cast().LoadHandler()) - .With(0, member) - .Build() - .Done()); - } - - return Build(ctx, pos) - 
.Args({loadStateArg}) - .Body() - .Add(structItems) - .Build() - .Done() - .Ptr(); -} - TMaybe BuildWatermarkMode( const TCoAggregate& aggregate, const TCoHoppingTraits& hoppingTraits, diff --git a/ydb/library/yql/tests/sql/dq_file/part10/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part10/canondata/result.json index 638f73aa00b1..349132d99979 100644 --- a/ydb/library/yql/tests/sql/dq_file/part10/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part10/canondata/result.json @@ -294,6 +294,34 @@ } ], "test.test[aggregate-group_by_column-default.txt-Results]": [], + "test.test[aggregate-group_by_hop_static-default.txt-Analyze]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1689644/763d9bd4404423a24deab02585b884f08692c90b/resource.tar.gz#test.test_aggregate-group_by_hop_static-default.txt-Analyze_/plan.txt" + } + ], + "test.test[aggregate-group_by_hop_static-default.txt-Debug]": [ + { + "checksum": "07d9a8f046f4661ba479dbaf70979aac", + "size": 1630, + "uri": "https://{canondata_backend}/1689644/763d9bd4404423a24deab02585b884f08692c90b/resource.tar.gz#test.test_aggregate-group_by_hop_static-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[aggregate-group_by_hop_static-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1689644/763d9bd4404423a24deab02585b884f08692c90b/resource.tar.gz#test.test_aggregate-group_by_hop_static-default.txt-Plan_/plan.txt" + } + ], + "test.test[aggregate-group_by_hop_static-default.txt-Results]": [ + { + "checksum": "dc21a63cca5d7481363c2b47840f1e38", + "size": 3102, + "uri": "https://{canondata_backend}/1689644/763d9bd4404423a24deab02585b884f08692c90b/resource.tar.gz#test.test_aggregate-group_by_hop_static-default.txt-Results_/results.txt" + } + ], "test.test[aggregate-group_by_mul_gs_ru--Analyze]": [ { "checksum": "e78b8c0f6855d3df92663efab505204b", diff --git 
a/ydb/library/yql/tests/sql/dq_file/part18/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part18/canondata/result.json index e91ad14e3524..2e85aaccd286 100644 --- a/ydb/library/yql/tests/sql/dq_file/part18/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part18/canondata/result.json @@ -322,6 +322,34 @@ "uri": "https://{canondata_backend}/1775059/e6328418d209e6f2afe65be714175e5a3ade006c/resource.tar.gz#test.test_aggregate-group_by_hop_only--Results_/results.txt" } ], + "test.test[aggregate-group_by_hop_static_list_key-default.txt-Analyze]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1130705/da7974592864104e97d4cfb7947d82f2379f0266/resource.tar.gz#test.test_aggregate-group_by_hop_static_list_key-default.txt-Analyze_/plan.txt" + } + ], + "test.test[aggregate-group_by_hop_static_list_key-default.txt-Debug]": [ + { + "checksum": "41d48b8937d3e4bcc583915a7460727d", + "size": 1946, + "uri": "https://{canondata_backend}/1925821/6132b4b967a7c6d2d9c522d4a344e781b4121793/resource.tar.gz#test.test_aggregate-group_by_hop_static_list_key-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[aggregate-group_by_hop_static_list_key-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1130705/da7974592864104e97d4cfb7947d82f2379f0266/resource.tar.gz#test.test_aggregate-group_by_hop_static_list_key-default.txt-Plan_/plan.txt" + } + ], + "test.test[aggregate-group_by_hop_static_list_key-default.txt-Results]": [ + { + "checksum": "dc21a63cca5d7481363c2b47840f1e38", + "size": 3102, + "uri": "https://{canondata_backend}/1130705/da7974592864104e97d4cfb7947d82f2379f0266/resource.tar.gz#test.test_aggregate-group_by_hop_static_list_key-default.txt-Results_/results.txt" + } + ], "test.test[aggregate-group_compact_sorted_distinct--Analyze]": [ { "checksum": "683fe495c075d2b1f1efcc8737139f4c", diff --git 
a/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json index 6bb56791df4d..e59e9089a1b0 100644 --- a/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json @@ -2871,9 +2871,9 @@ ], "test.test[window-full/session--Debug]": [ { - "checksum": "333f05b30f4566a815ef4c596c6fd1f6", - "size": 13899, - "uri": "https://{canondata_backend}/212715/b9f267b2022a251b638e7a1f1ebeb788c308ed2f/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql_patched" + "checksum": "b06da41f9a9ea38646c43487f4b8b96a", + "size": 13340, + "uri": "https://{canondata_backend}/1775319/8ac8c87858e0db34f5a3c99b3f4ca1084cccbace/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql_patched" } ], "test.test[window-full/session--Plan]": [ @@ -2885,9 +2885,9 @@ ], "test.test[window-full/session_aliases--Debug]": [ { - "checksum": "d5414bac5bc6a9e87cfabc5266971d85", - "size": 14757, - "uri": "https://{canondata_backend}/212715/b9f267b2022a251b638e7a1f1ebeb788c308ed2f/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql_patched" + "checksum": "e021555a47e83d0b792765a8ee82be94", + "size": 14124, + "uri": "https://{canondata_backend}/1775319/8ac8c87858e0db34f5a3c99b3f4ca1084cccbace/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql_patched" } ], "test.test[window-full/session_aliases--Plan]": [ diff --git a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json index ab33e315fbbf..4b4262ce670a 100644 --- a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json +++ b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json @@ -2183,6 +2183,20 @@ "uri": "https://{canondata_backend}/1784117/d56ae82ad9d30397a41490647be1bd2124718f98/resource.tar.gz#test_sql2yql.test_aggregate-group_by_hop_star_/sql.yql" } ], + 
"test_sql2yql.test[aggregate-group_by_hop_static]": [ + { + "checksum": "a7a563dc87672b141c8209b38c0d446c", + "size": 3368, + "uri": "https://{canondata_backend}/1925821/aca60c4aca6b335189396eb0d636b37dbc38e5d9/resource.tar.gz#test_sql2yql.test_aggregate-group_by_hop_static_/sql.yql" + } + ], + "test_sql2yql.test[aggregate-group_by_hop_static_list_key]": [ + { + "checksum": "4b8a74647da998a54e0ccffae0f365d6", + "size": 3547, + "uri": "https://{canondata_backend}/1937492/6205ff455a623f62222bc8ee2c2ee5c2e7ee4174/resource.tar.gz#test_sql2yql.test_aggregate-group_by_hop_static_list_key_/sql.yql" + } + ], "test_sql2yql.test[aggregate-group_by_mul_gb_ru]": [ { "checksum": "002e7ddce42c228debb7382e9f8ea1d3", @@ -21419,6 +21433,20 @@ "uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_aggregate-group_by_hop_star_/formatted.sql" } ], + "test_sql_format.test[aggregate-group_by_hop_static]": [ + { + "checksum": "a6f19201a2a81c7308fe9947b59276bf", + "size": 955, + "uri": "https://{canondata_backend}/1925821/aca60c4aca6b335189396eb0d636b37dbc38e5d9/resource.tar.gz#test_sql_format.test_aggregate-group_by_hop_static_/formatted.sql" + } + ], + "test_sql_format.test[aggregate-group_by_hop_static_list_key]": [ + { + "checksum": "3d3184e982097fa7fed63bdeef6c1fae", + "size": 976, + "uri": "https://{canondata_backend}/1937492/6205ff455a623f62222bc8ee2c2ee5c2e7ee4174/resource.tar.gz#test_sql_format.test_aggregate-group_by_hop_static_list_key_/formatted.sql" + } + ], "test_sql_format.test[aggregate-group_by_mul_gb_ru]": [ { "checksum": "adae92846c7098e2ea3468096a13ffae", diff --git a/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static.sql b/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static.sql new file mode 100644 index 000000000000..fec507c827a5 --- /dev/null +++ b/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static.sql @@ -0,0 +1,26 @@ +/* syntax version 1 */ +/* postgres can not */ 
+/* ytfile can not */ +/* yt can not */ + +$input = SELECT * FROM AS_TABLE([ + <|"time":"2024-01-01T00:00:01Z", "user": 1|>, + <|"time":"2024-01-01T00:00:02Z", "user": 1|>, + <|"time":"2024-01-01T00:00:03Z", "user": 1|>, + <|"time":"2024-01-01T00:00:01Z", "user": 2|>, + <|"time":"2024-01-01T00:00:02Z", "user": 2|>, + <|"time":"2024-01-01T00:00:03Z", "user": 2|>, + <|"time":"2024-01-01T00:00:01Z", "user": 2|>, + <|"time":"2024-01-01T00:00:02Z", "user": 2|>, + <|"time":"2024-01-01T00:00:03Z", "user": 2|>, + <|"time":"2024-01-01T00:00:01Z", "user": 3|>, + <|"time":"2024-01-01T00:00:02Z", "user": 3|>, + <|"time":"2024-01-01T00:00:03Z", "user": 3|> +]); + +SELECT + user, + COUNT(*) as count, + HOP_START() as start, +FROM $input +GROUP BY HOP(CAST(time as Timestamp), 'PT1S', 'PT1S', 'PT1S'), user; diff --git a/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static_list_key.sql b/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static_list_key.sql new file mode 100644 index 000000000000..3639207bb340 --- /dev/null +++ b/ydb/library/yql/tests/sql/suites/aggregate/group_by_hop_static_list_key.sql @@ -0,0 +1,26 @@ +/* syntax version 1 */ +/* postgres can not */ +/* ytfile can not */ +/* yt can not */ + +$input = SELECT * FROM AS_TABLE([ + <|"time":"2024-01-01T00:00:01Z", "user": 1|>, + <|"time":"2024-01-01T00:00:02Z", "user": 1|>, + <|"time":"2024-01-01T00:00:03Z", "user": 1|>, + <|"time":"2024-01-01T00:00:01Z", "user": 2|>, + <|"time":"2024-01-01T00:00:02Z", "user": 2|>, + <|"time":"2024-01-01T00:00:03Z", "user": 2|>, + <|"time":"2024-01-01T00:00:01Z", "user": 2|>, + <|"time":"2024-01-01T00:00:02Z", "user": 2|>, + <|"time":"2024-01-01T00:00:03Z", "user": 2|>, + <|"time":"2024-01-01T00:00:01Z", "user": 3|>, + <|"time":"2024-01-01T00:00:02Z", "user": 3|>, + <|"time":"2024-01-01T00:00:03Z", "user": 3|> +]); + +SELECT + user, + COUNT(*) as count, + HOP_START() as start, +FROM $input +GROUP BY HOP(CAST(time as Timestamp), 'PT1S', 'PT1S', 'PT1S'), user, 
AsList(user, 0); diff --git a/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json b/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json index 2c743f411c5a..91d5183c9879 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part19/canondata/result.json @@ -2924,9 +2924,9 @@ ], "test.test[window-full/session--Debug]": [ { - "checksum": "a7f5b924c596e4861cfff98981b5f071", - "size": 11042, - "uri": "https://{canondata_backend}/1937027/16b7289b1b8f5fdff728155d836fa2b238949b2d/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql" + "checksum": "fd79f82807ae5a2b2ac7181f3da01c58", + "size": 11314, + "uri": "https://{canondata_backend}/1942173/f70acaf8d9dbbd62a5305d5424f4de9ac3080ddc/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql" } ], "test.test[window-full/session--Plan]": [ diff --git a/ydb/library/yql/tests/sql/yt_native_file/part6/canondata/result.json b/ydb/library/yql/tests/sql/yt_native_file/part6/canondata/result.json index 99e67b4bcc33..bc9fd6f3f2fe 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part6/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part6/canondata/result.json @@ -2744,9 +2744,9 @@ ], "test.test[window-full/session_aliases--Debug]": [ { - "checksum": "88d37ebd17099f93d640d857b6198de6", - "size": 11552, - "uri": "https://{canondata_backend}/1917492/ddc0a6b96495a49628829c42f1882eff49e71e11/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql" + "checksum": "751c1ae97702b51753f626bfa02facbd", + "size": 11764, + "uri": "https://{canondata_backend}/212715/c96504db58dd13ce5e79be71afa29b676fde90a1/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql" } ], "test.test[window-full/session_aliases--Plan]": [ From 3a6e84f37f5bb27a43ae21bfbfd061d3cc568ea0 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 16 Oct 2024 10:52:52 +0300 Subject: [PATCH 40/56] YQ: 
add simdjson to q-stable (#10479) Co-authored-by: Maxim Yurchuk --- contrib/libs/simdjson/AUTHORS | 4 + contrib/libs/simdjson/CONTRIBUTING.md | 103 ++ contrib/libs/simdjson/HACKING.md | 332 +++++ contrib/libs/simdjson/LICENSE | 201 +++ contrib/libs/simdjson/README.md | 227 +++ contrib/libs/simdjson/SECURITY.md | 7 + contrib/libs/simdjson/include/simdjson.h | 57 + .../libs/simdjson/include/simdjson/arm64.h | 8 + .../simdjson/include/simdjson/arm64/base.h | 26 + .../simdjson/include/simdjson/arm64/begin.h | 10 + .../include/simdjson/arm64/bitmanipulation.h | 106 ++ .../simdjson/include/simdjson/arm64/bitmask.h | 44 + .../simdjson/include/simdjson/arm64/end.h | 6 + .../include/simdjson/arm64/implementation.h | 31 + .../include/simdjson/arm64/intrinsics.h | 14 + .../simdjson/arm64/numberparsing_defs.h | 56 + .../include/simdjson/arm64/ondemand.h | 8 + .../simdjson/include/simdjson/arm64/simd.h | 497 +++++++ .../simdjson/arm64/stringparsing_defs.h | 53 + contrib/libs/simdjson/include/simdjson/base.h | 60 + .../libs/simdjson/include/simdjson/builtin.h | 33 + .../simdjson/include/simdjson/builtin/base.h | 41 + .../include/simdjson/builtin/implementation.h | 42 + .../include/simdjson/builtin/ondemand.h | 40 + .../simdjson/include/simdjson/common_defs.h | 347 +++++ .../include/simdjson/compiler_check.h | 53 + contrib/libs/simdjson/include/simdjson/dom.h | 23 + .../simdjson/include/simdjson/dom/array-inl.h | 181 +++ .../simdjson/include/simdjson/dom/array.h | 183 +++ .../libs/simdjson/include/simdjson/dom/base.h | 54 + .../include/simdjson/dom/document-inl.h | 159 ++ .../simdjson/include/simdjson/dom/document.h | 91 ++ .../simdjson/dom/document_stream-inl.h | 348 +++++ .../include/simdjson/dom/document_stream.h | 322 ++++ .../include/simdjson/dom/element-inl.h | 473 ++++++ .../simdjson/include/simdjson/dom/element.h | 552 +++++++ .../include/simdjson/dom/object-inl.h | 263 ++++ .../simdjson/include/simdjson/dom/object.h | 274 ++++ .../include/simdjson/dom/parser-inl.h | 258 
++++ .../simdjson/include/simdjson/dom/parser.h | 650 ++++++++ .../include/simdjson/dom/serialization-inl.h | 536 +++++++ .../include/simdjson/dom/serialization.h | 260 ++++ .../simdjson/include/simdjson/error-inl.h | 184 +++ .../libs/simdjson/include/simdjson/error.h | 318 ++++ .../libs/simdjson/include/simdjson/fallback.h | 8 + .../simdjson/include/simdjson/fallback/base.h | 19 + .../include/simdjson/fallback/begin.h | 5 + .../simdjson/fallback/bitmanipulation.h | 48 + .../simdjson/include/simdjson/fallback/end.h | 5 + .../simdjson/fallback/implementation.h | 34 + .../simdjson/fallback/numberparsing_defs.h | 80 + .../include/simdjson/fallback/ondemand.h | 8 + .../simdjson/fallback/stringparsing_defs.h | 36 + .../include/simdjson/generic/amalgamated.h | 12 + .../include/simdjson/generic/atomparsing.h | 77 + .../simdjson/include/simdjson/generic/base.h | 51 + .../include/simdjson/generic/dependencies.h | 20 + .../generic/dom_parser_implementation.h | 89 ++ .../implementation_simdjson_result_base-inl.h | 90 ++ .../implementation_simdjson_result_base.h | 134 ++ .../include/simdjson/generic/jsoncharutils.h | 104 ++ .../include/simdjson/generic/numberparsing.h | 1310 +++++++++++++++++ .../simdjson/generic/ondemand/amalgamated.h | 42 + .../simdjson/generic/ondemand/array-inl.h | 283 ++++ .../include/simdjson/generic/ondemand/array.h | 216 +++ .../generic/ondemand/array_iterator-inl.h | 78 + .../generic/ondemand/array_iterator.h | 96 ++ .../include/simdjson/generic/ondemand/base.h | 47 + .../simdjson/generic/ondemand/dependencies.h | 17 + .../simdjson/generic/ondemand/document-inl.h | 917 ++++++++++++ .../simdjson/generic/ondemand/document.h | 914 ++++++++++++ .../generic/ondemand/document_stream-inl.h | 433 ++++++ .../generic/ondemand/document_stream.h | 337 +++++ .../simdjson/generic/ondemand/field-inl.h | 129 ++ .../include/simdjson/generic/ondemand/field.h | 113 ++ .../generic/ondemand/json_iterator-inl.h | 444 ++++++ .../simdjson/generic/ondemand/json_iterator.h | 
338 +++++ .../simdjson/generic/ondemand/json_type-inl.h | 117 ++ .../simdjson/generic/ondemand/json_type.h | 160 ++ .../simdjson/generic/ondemand/logger-inl.h | 225 +++ .../simdjson/generic/ondemand/logger.h | 58 + .../simdjson/generic/ondemand/object-inl.h | 276 ++++ .../simdjson/generic/ondemand/object.h | 258 ++++ .../generic/ondemand/object_iterator-inl.h | 138 ++ .../generic/ondemand/object_iterator.h | 80 + .../simdjson/generic/ondemand/parser-inl.h | 205 +++ .../simdjson/generic/ondemand/parser.h | 372 +++++ .../generic/ondemand/raw_json_string-inl.h | 203 +++ .../generic/ondemand/raw_json_string.h | 206 +++ .../generic/ondemand/serialization-inl.h | 233 +++ .../simdjson/generic/ondemand/serialization.h | 103 ++ .../generic/ondemand/token_iterator-inl.h | 94 ++ .../generic/ondemand/token_iterator.h | 158 ++ .../simdjson/generic/ondemand/value-inl.h | 542 +++++++ .../include/simdjson/generic/ondemand/value.h | 781 ++++++++++ .../generic/ondemand/value_iterator-inl.h | 1091 ++++++++++++++ .../generic/ondemand/value_iterator.h | 492 +++++++ .../libs/simdjson/include/simdjson/haswell.h | 8 + .../simdjson/include/simdjson/haswell/base.h | 27 + .../simdjson/include/simdjson/haswell/begin.h | 14 + .../simdjson/haswell/bitmanipulation.h | 71 + .../include/simdjson/haswell/bitmask.h | 30 + .../simdjson/include/simdjson/haswell/end.h | 9 + .../include/simdjson/haswell/implementation.h | 36 + .../include/simdjson/haswell/intrinsics.h | 52 + .../simdjson/haswell/numberparsing_defs.h | 61 + .../include/simdjson/haswell/ondemand.h | 8 + .../simdjson/include/simdjson/haswell/simd.h | 372 +++++ .../simdjson/haswell/stringparsing_defs.h | 48 + .../libs/simdjson/include/simdjson/icelake.h | 8 + .../simdjson/include/simdjson/icelake/base.h | 20 + .../simdjson/include/simdjson/icelake/begin.h | 13 + .../simdjson/icelake/bitmanipulation.h | 70 + .../include/simdjson/icelake/bitmask.h | 30 + .../simdjson/include/simdjson/icelake/end.h | 9 + 
.../include/simdjson/icelake/implementation.h | 36 + .../include/simdjson/icelake/intrinsics.h | 60 + .../simdjson/icelake/numberparsing_defs.h | 57 + .../include/simdjson/icelake/ondemand.h | 8 + .../simdjson/include/simdjson/icelake/simd.h | 372 +++++ .../simdjson/icelake/stringparsing_defs.h | 48 + .../include/simdjson/implementation.h | 230 +++ .../simdjson/implementation_detection.h | 168 +++ .../include/simdjson/internal/atomic_ptr.h | 31 + .../internal/dom_parser_implementation.h | 252 ++++ .../simdjson/internal/instruction_set.h | 77 + .../simdjson/internal/jsoncharutils_tables.h | 26 + .../simdjson/internal/jsonformatutils.h | 64 + .../simdjson/internal/numberparsing_tables.h | 59 + .../simdjson/internal/simdprune_tables.h | 21 + .../include/simdjson/internal/tape_ref-inl.h | 118 ++ .../include/simdjson/internal/tape_ref.h | 49 + .../include/simdjson/internal/tape_type.h | 28 + .../simdjson/include/simdjson/jsonioutil.h | 22 + contrib/libs/simdjson/include/simdjson/lasx.h | 8 + .../simdjson/include/simdjson/lasx/base.h | 26 + .../simdjson/include/simdjson/lasx/begin.h | 10 + .../include/simdjson/lasx/bitmanipulation.h | 50 + .../simdjson/include/simdjson/lasx/bitmask.h | 31 + .../libs/simdjson/include/simdjson/lasx/end.h | 6 + .../include/simdjson/lasx/implementation.h | 31 + .../include/simdjson/lasx/intrinsics.h | 14 + .../simdjson/lasx/numberparsing_defs.h | 41 + .../simdjson/include/simdjson/lasx/ondemand.h | 8 + .../simdjson/include/simdjson/lasx/simd.h | 376 +++++ .../simdjson/lasx/stringparsing_defs.h | 47 + contrib/libs/simdjson/include/simdjson/lsx.h | 8 + .../libs/simdjson/include/simdjson/lsx/base.h | 26 + .../simdjson/include/simdjson/lsx/begin.h | 10 + .../include/simdjson/lsx/bitmanipulation.h | 50 + .../simdjson/include/simdjson/lsx/bitmask.h | 31 + .../libs/simdjson/include/simdjson/lsx/end.h | 6 + .../include/simdjson/lsx/implementation.h | 31 + .../include/simdjson/lsx/intrinsics.h | 14 + .../include/simdjson/lsx/numberparsing_defs.h | 41 
+ .../simdjson/include/simdjson/lsx/ondemand.h | 8 + .../libs/simdjson/include/simdjson/lsx/simd.h | 354 +++++ .../include/simdjson/lsx/stringparsing_defs.h | 53 + .../libs/simdjson/include/simdjson/minify.h | 30 + .../libs/simdjson/include/simdjson/ondemand.h | 13 + .../include/simdjson/padded_string-inl.h | 190 +++ .../simdjson/include/simdjson/padded_string.h | 183 +++ .../include/simdjson/padded_string_view-inl.h | 59 + .../include/simdjson/padded_string_view.h | 88 ++ .../simdjson/include/simdjson/portability.h | 204 +++ .../libs/simdjson/include/simdjson/ppc64.h | 8 + .../simdjson/include/simdjson/ppc64/base.h | 26 + .../simdjson/include/simdjson/ppc64/begin.h | 10 + .../include/simdjson/ppc64/bitmanipulation.h | 78 + .../simdjson/include/simdjson/ppc64/bitmask.h | 46 + .../simdjson/include/simdjson/ppc64/end.h | 6 + .../include/simdjson/ppc64/implementation.h | 40 + .../include/simdjson/ppc64/intrinsics.h | 23 + .../simdjson/ppc64/numberparsing_defs.h | 65 + .../include/simdjson/ppc64/ondemand.h | 8 + .../simdjson/include/simdjson/ppc64/simd.h | 472 ++++++ .../simdjson/ppc64/stringparsing_defs.h | 65 + .../libs/simdjson/include/simdjson/simdjson.h | 11 + .../include/simdjson/simdjson_version.h | 26 + .../libs/simdjson/include/simdjson/westmere.h | 8 + .../simdjson/include/simdjson/westmere/base.h | 29 + .../include/simdjson/westmere/begin.h | 13 + .../simdjson/westmere/bitmanipulation.h | 79 + .../include/simdjson/westmere/bitmask.h | 30 + .../simdjson/include/simdjson/westmere/end.h | 9 + .../simdjson/westmere/implementation.h | 32 + .../include/simdjson/westmere/intrinsics.h | 31 + .../simdjson/westmere/numberparsing_defs.h | 59 + .../include/simdjson/westmere/ondemand.h | 8 + .../simdjson/include/simdjson/westmere/simd.h | 338 +++++ .../simdjson/westmere/stringparsing_defs.h | 47 + contrib/libs/simdjson/src/arm64.cpp | 172 +++ contrib/libs/simdjson/src/base.h | 6 + contrib/libs/simdjson/src/fallback.cpp | 410 ++++++ 
contrib/libs/simdjson/src/from_chars.cpp | 606 ++++++++ .../libs/simdjson/src/generic/amalgamated.h | 7 + contrib/libs/simdjson/src/generic/base.h | 19 + .../libs/simdjson/src/generic/dependencies.h | 10 + .../src/generic/dom_parser_implementation.h | 21 + .../src/generic/json_character_block.h | 27 + .../simdjson/src/generic/stage1/amalgamated.h | 13 + .../libs/simdjson/src/generic/stage1/base.h | 35 + .../src/generic/stage1/buf_block_reader.h | 116 ++ .../src/generic/stage1/dependencies.h | 4 + .../generic/stage1/find_next_document_index.h | 105 ++ .../src/generic/stage1/json_escape_scanner.h | 151 ++ .../src/generic/stage1/json_minifier.h | 104 ++ .../src/generic/stage1/json_scanner.h | 168 +++ .../src/generic/stage1/json_string_scanner.h | 99 ++ .../generic/stage1/json_structural_indexer.h | 358 +++++ .../generic/stage1/utf8_lookup4_algorithm.h | 209 +++ .../src/generic/stage1/utf8_validator.h | 45 + .../simdjson/src/generic/stage2/amalgamated.h | 10 + .../libs/simdjson/src/generic/stage2/base.h | 23 + .../src/generic/stage2/dependencies.h | 7 + .../src/generic/stage2/json_iterator.h | 328 +++++ .../libs/simdjson/src/generic/stage2/logger.h | 100 ++ .../src/generic/stage2/stringparsing.h | 244 +++ .../src/generic/stage2/structural_iterator.h | 64 + .../src/generic/stage2/tape_builder.h | 297 ++++ .../simdjson/src/generic/stage2/tape_writer.h | 117 ++ contrib/libs/simdjson/src/haswell.cpp | 169 +++ contrib/libs/simdjson/src/icelake.cpp | 215 +++ contrib/libs/simdjson/src/implementation.cpp | 330 +++++ .../simdjson/src/internal/error_tables.cpp | 48 + .../libs/simdjson/src/internal/isadetection.h | 247 ++++ .../src/internal/jsoncharutils_tables.cpp | 197 +++ .../src/internal/numberparsing_tables.cpp | 681 +++++++++ .../src/internal/simdprune_tables.cpp | 138 ++ contrib/libs/simdjson/src/simdjson.cpp | 50 + contrib/libs/simdjson/src/to_chars.cpp | 954 ++++++++++++ contrib/libs/simdjson/src/westmere.cpp | 174 +++ contrib/libs/simdjson/ya.make | 35 + 
ydb/ci/rightlib.txt | 2 +- 234 files changed, 33201 insertions(+), 1 deletion(-) create mode 100644 contrib/libs/simdjson/AUTHORS create mode 100644 contrib/libs/simdjson/CONTRIBUTING.md create mode 100644 contrib/libs/simdjson/HACKING.md create mode 100644 contrib/libs/simdjson/LICENSE create mode 100644 contrib/libs/simdjson/README.md create mode 100644 contrib/libs/simdjson/SECURITY.md create mode 100644 contrib/libs/simdjson/include/simdjson.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/arm64/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/builtin.h create mode 100644 contrib/libs/simdjson/include/simdjson/builtin/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/builtin/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/builtin/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/common_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/compiler_check.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom.h create mode 100644 
contrib/libs/simdjson/include/simdjson/dom/array-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/array.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/document-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/document.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/document_stream-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/document_stream.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/element-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/element.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/object-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/object.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/parser-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/parser.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/serialization-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/dom/serialization.h create mode 100644 contrib/libs/simdjson/include/simdjson/error-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/error.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/fallback/stringparsing_defs.h create mode 100644 
contrib/libs/simdjson/include/simdjson/generic/amalgamated.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/atomparsing.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/dependencies.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/dom_parser_implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/jsoncharutils.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/numberparsing.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/amalgamated.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/array-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/array.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/dependencies.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/document-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/document.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/field-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/field.h create mode 100644 
contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/logger-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/logger.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/object-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/object.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/parser-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/parser.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/value-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/value.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator.h create mode 100644 
contrib/libs/simdjson/include/simdjson/haswell.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/haswell/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/icelake/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/implementation.h create mode 100644 
contrib/libs/simdjson/include/simdjson/implementation_detection.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/atomic_ptr.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/dom_parser_implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/instruction_set.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/jsoncharutils_tables.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/jsonformatutils.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/numberparsing_tables.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/simdprune_tables.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/tape_ref-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/tape_ref.h create mode 100644 contrib/libs/simdjson/include/simdjson/internal/tape_type.h create mode 100644 contrib/libs/simdjson/include/simdjson/jsonioutil.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/lasx/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx.h create mode 100644 
contrib/libs/simdjson/include/simdjson/lsx/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/lsx/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/minify.h create mode 100644 contrib/libs/simdjson/include/simdjson/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/padded_string-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/padded_string.h create mode 100644 contrib/libs/simdjson/include/simdjson/padded_string_view-inl.h create mode 100644 contrib/libs/simdjson/include/simdjson/padded_string_view.h create mode 100644 contrib/libs/simdjson/include/simdjson/portability.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/intrinsics.h create mode 100644 
contrib/libs/simdjson/include/simdjson/ppc64/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/ppc64/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/simdjson.h create mode 100644 contrib/libs/simdjson/include/simdjson/simdjson_version.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/base.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/begin.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/bitmanipulation.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/bitmask.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/end.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/implementation.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/intrinsics.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/numberparsing_defs.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/ondemand.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/simd.h create mode 100644 contrib/libs/simdjson/include/simdjson/westmere/stringparsing_defs.h create mode 100644 contrib/libs/simdjson/src/arm64.cpp create mode 100644 contrib/libs/simdjson/src/base.h create mode 100644 contrib/libs/simdjson/src/fallback.cpp create mode 100644 contrib/libs/simdjson/src/from_chars.cpp create mode 100644 contrib/libs/simdjson/src/generic/amalgamated.h create mode 100644 contrib/libs/simdjson/src/generic/base.h create mode 100644 contrib/libs/simdjson/src/generic/dependencies.h create mode 100644 contrib/libs/simdjson/src/generic/dom_parser_implementation.h create mode 100644 contrib/libs/simdjson/src/generic/json_character_block.h create mode 100644 
contrib/libs/simdjson/src/generic/stage1/amalgamated.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/base.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/buf_block_reader.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/dependencies.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/find_next_document_index.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/json_escape_scanner.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/json_minifier.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/json_scanner.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/json_string_scanner.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/json_structural_indexer.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/utf8_lookup4_algorithm.h create mode 100644 contrib/libs/simdjson/src/generic/stage1/utf8_validator.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/amalgamated.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/base.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/dependencies.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/json_iterator.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/logger.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/stringparsing.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/structural_iterator.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/tape_builder.h create mode 100644 contrib/libs/simdjson/src/generic/stage2/tape_writer.h create mode 100644 contrib/libs/simdjson/src/haswell.cpp create mode 100644 contrib/libs/simdjson/src/icelake.cpp create mode 100644 contrib/libs/simdjson/src/implementation.cpp create mode 100644 contrib/libs/simdjson/src/internal/error_tables.cpp create mode 100644 contrib/libs/simdjson/src/internal/isadetection.h create mode 100644 contrib/libs/simdjson/src/internal/jsoncharutils_tables.cpp create 
mode 100644 contrib/libs/simdjson/src/internal/numberparsing_tables.cpp create mode 100644 contrib/libs/simdjson/src/internal/simdprune_tables.cpp create mode 100644 contrib/libs/simdjson/src/simdjson.cpp create mode 100644 contrib/libs/simdjson/src/to_chars.cpp create mode 100644 contrib/libs/simdjson/src/westmere.cpp create mode 100644 contrib/libs/simdjson/ya.make diff --git a/contrib/libs/simdjson/AUTHORS b/contrib/libs/simdjson/AUTHORS new file mode 100644 index 000000000000..e23c6beb4b3a --- /dev/null +++ b/contrib/libs/simdjson/AUTHORS @@ -0,0 +1,4 @@ +# List of authors for copyright purposes, in no particular order +Daniel Lemire +Geoff Langdale +John Keiser diff --git a/contrib/libs/simdjson/CONTRIBUTING.md b/contrib/libs/simdjson/CONTRIBUTING.md new file mode 100644 index 000000000000..a6b70a0bf246 --- /dev/null +++ b/contrib/libs/simdjson/CONTRIBUTING.md @@ -0,0 +1,103 @@ +Contributing +============ + +The simdjson library is an open project written in C++. Contributions are invited. Contributors +agree to the project's license. + +We have an extensive list of issues, and contributions toward any of these issues is invited. +Contributions can take the form of code samples, better documentation or design ideas. + +In particular, the following contributions are invited: + +- The library is focused on performance. Well-documented performance optimization are invited. +- Fixes to known or newly discovered bugs are always welcome. Typically, a bug fix should come with + a test demonstrating that the bug has been fixed. +- The simdjson library is advanced software and maintainability and flexibility are always a + concern. Specific contributions to improve maintainability and flexibility are invited. + +We discourage the following types of contributions: + +- Code refactoring. We all have our preferences as to how code should be written, but unnecessary + refactoring can waste time and introduce new bugs. 
If you believe that refactoring is needed, you + first must explain how it helps in concrete terms. Does it improve the performance? +- Applications of new language features for their own sake. Using advanced C++ language constructs + is actually a negative as it may reduce portability (to old compilers, old standard libraries and + systems) and reduce accessibility (to programmers that have not kept up), so it must be offsetted + by clear gains like performance or maintainability. When in doubt, avoid advanced C++ features + (beyond C++11). +- Style formatting. In general, please abstain from reformatting code just to make it look prettier. + Though code formatting is important, it can also be a waste of time if several contributors try to + tweak the code base toward their own preference. Please do not introduce unneeded white-space + changes. + +In short, most code changes should either bring new features or better performance. We want to avoid unmotivated code changes. + + +Specific rules +---------- + +We have few hard rules, but we have some: + +- Printing to standard output or standard error (`stderr`, `stdout`, `std::cerr`, `std::cout`) in the core library is forbidden. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Compiled code should not write to stdout or stderr". +- Calls to `abort()` are forbidden in the core library. This follows from the [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) manual which states that "Under no circumstances should your compiled code ever call abort or exit". +- All source code files (.h, .cpp) must be ASCII. +- All C macros introduced in public headers need to be prefixed with either `SIMDJSON_` or `simdjson_`. +- We avoid trailing white space characters within lines. That is, your lines of code should not terminate with unnecessary spaces. 
Generally, please avoid making unnecessary changes to white-space characters when contributing code. + +Tools, tests and benchmarks are not held to these same strict rules. + +General Guidelines +---------- + +Contributors are encouraged to : + +- Document their changes. Though we do not enforce a rule regarding code comments, we prefer that non-trivial algorithms and techniques be somewhat documented in the code. +- Follow as much as possible the existing code style. We do not enforce a specific code style, but we prefer consistency. We avoid contractions (isn't, aren't) in the comments. +- Modify as few lines of code as possible when working on an issue. The more lines you modify, the harder it is for your fellow human beings to understand what is going on. +- Tools may report "problems" with the code, but we never delegate programming to tools: if there is a problem with the code, we need to understand it. Thus we will not "fix" code merely to please a static analyzer. +- Provide tests for any new feature. We will not merge a new feature without tests. +- Run before/after benchmarks so that we can appreciate the effect of the changes on the performance. + +Pull Requests +-------------- + +Pull requests are always invited. However, we ask that you follow these guidelines: + +- It is wise to discuss your ideas first as part of an issue before you start coding. If you omit this step and code first, be prepared to have your code receive scrutiny and be dropped. +- Users should provide a rationale for their changes. Does it improve performance? Does it add a feature? Does it improve maintainability? Does it fix a bug? This must be explicitly stated as part of the pull request. Do not propose changes based on taste or intuition. We do not delegate programming to tools: that some tool suggested a code change is not reason enough to change the code. + 1. When your code improves performance, please document the gains with a benchmark using hard numbers. + 2. 
If your code fixes a bug, please either fix a failing test, or propose a new test. + 3. Other types of changes must be clearly motivated. We openly discourage changes with no identifiable benefits. +- Changes should be focused and minimal. You should change as few lines of code as possible. Please do not reformat or touch files needlessly. +- New features must be accompanied by new tests, in general. +- Your code should pass our continuous-integration tests. It is your responsibility to ensure that your proposal pass the tests. We do not merge pull requests that would break our build. + - An exception to this would be changes to non-code files, such as documentation and assets, or trivial changes to code, such as comments, where it is encouraged to explicitly ask for skipping a CI run using the `[skip ci]` prefix in your Pull Request title **and** in the first line of the most recent commit in a push. Example for such a commit: `[skip ci] Fixed typo in power_of_ten's docs` + This benefits the project in such a way that the CI pipeline is not burdened by running jobs on changes that don't change any behavior in the code, which reduces wait times for other Pull Requests that do change behavior and require testing. + +If the benefits of your proposed code remain unclear, we may choose to discard your code: that is not an insult, we frequently discard our own code. We may also consider various alternatives and choose another path. Again, that is not an insult or a sign that you have wasted your time. + +Style +----- + +Our formatting style is inspired by the LLVM style. +The simdjson library is written using the snake case: when a variable or a function is a phrase, each space is replaced by an underscore character, and the first letter of each word written in lowercase. Compile-time constants are written entirely in uppercase with the same underscore convention. 
+ +Code of Conduct +--------------- + +Though we do not have a formal code of conduct, we will not tolerate bullying, bigotry or +intimidation. Everyone is welcome to contribute. If you have concerns, you can raise them privately with the core team members (e.g., D. Lemire, J. Keiser). + +We welcome contributions from women and less represented groups. If you need help, please reach out. + +Consider the following points when engaging with the project: + +- We discourage arguments from authority: ideas are discusssed on their own merits and not based on who stated it. +- Be mindful that what you may view as an aggression is maybe merely a difference of opinion or a misunderstanding. +- Be mindful that a collection of small aggressions, even if mild in isolation, can become harmful. + +Getting Started Hacking +----------------------- + +An overview of simdjson's directory structure, with pointers to architecture and design +considerations and other helpful notes, can be found at [HACKING.md](HACKING.md). diff --git a/contrib/libs/simdjson/HACKING.md b/contrib/libs/simdjson/HACKING.md new file mode 100644 index 000000000000..8ee62672f7de --- /dev/null +++ b/contrib/libs/simdjson/HACKING.md @@ -0,0 +1,332 @@ + +Hacking simdjson +================ + +Here is wisdom about how to build, test and run simdjson from within the repository. This is mostly useful for people who plan to contribute simdjson, or maybe study the design. + +If you plan to contribute to simdjson, please read our [CONTRIBUTING](https://github.com/simdjson/simdjson/blob/master/CONTRIBUTING.md) guide. 
+ +- [Hacking simdjson](#hacking-simdjson) + - [Build Quickstart](#build-quickstart) + - [Design notes](#design-notes) + - [Developer mode](#developer-mode) + - [Directory Structure and Source](#directory-structure-and-source) + - [Runtime Dispatching](#runtime-dispatching) + - [Regenerating Single-Header Files](#regenerating-single-header-files) + - [Usage (CMake on 64-bit platforms like Linux, FreeBSD or macOS)](#usage-cmake-on-64-bit-platforms-like-linux-freebsd-or-macos) + - [Usage (CMake on 64-bit Windows using Visual Studio 2019 or better)](#usage-cmake-on-64-bit-windows-using-visual-studio-2019-or-better) + - [Various References](#various-references) + +Build Quickstart +------------------------------ + +```bash +mkdir build +cd build +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . +``` + +Design notes +------------------------------ + +The parser works in two stages: + +- Stage 1. (Find marks) Identifies quickly structure elements, strings, and so forth. We validate UTF-8 encoding at that stage. +- Stage 2. (Structure building) Involves constructing a "tree" of sort (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage. + + +The role of stage 1 is to identify pseudo-structural characters as quickly as possible. A character is pseudo-structural if and only if: + +1. Not enclosed in quotes, AND +2. Is a non-whitespace character, AND +3. Its preceding character is either: + (a) a structural character, OR + (b) whitespace OR + (c) the final quote in a string. + +This helps as we redefine some new characters as pseudo-structural such as the characters 1, G, n in the following: + +> { "foo" : 1.5, "bar" : 1.5 GEOFF_IS_A_DUMMY bla bla , "baz", null } + +Stage 1 also does unicode validation. + +Stage 2 handles all of the rest: number parsings, recognizing atoms like true, false, null, and so forth. 
+ +Developer mode +-------------- + +Build system targets that are only useful for developers of the simdjson +library are behind the `SIMDJSON_DEVELOPER_MODE` option. Enabling this option +makes tests, examples, benchmarks and other developer targets available. Not +enabling this option means that you are a consumer of simdjson and thus you +only get the library targets and options. + +Developer mode is forced to be on when the `CI` environment variable is set to +a value that CMake recognizes as "on", which is set to `true` in all of the CI +workflows used by simdjson. + +Directory Structure and Source +------------------------------ + +simdjson's source structure, from the top level, looks like this: + +* **CMakeLists.txt:** The main build system. +* **include:** User-facing declarations and inline definitions (most user-facing functions are inlined). + * simdjson.h: the `simdjson` namespace. A "main include" that includes files from include/simdjson/. This is equivalent to + the distributed simdjson.h. + * simdjson/*.h: Declarations for public simdjson classes and functions. + * simdjson/*-inl.h: Definitions for public simdjson classes and functions. + * simdjson/internal/*.h: the `simdjson::internal` namespace. Private classes and functions used by the rest of simdjson. + * simdjson/dom.h: the `simdjson::dom` namespace. Includes all public DOM classes. + * simdjson/dom/*.h: Declarations/definitions for individual DOM classes. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere.h: `simdjson::<implementation>` namespace. Common implementation-specific tools like number and string parsing, as well as minification. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere/*.h: implementation-specific functions, etc. + * simdjson/generic/*.h: the bulk of the actual code, written generically and compiled for each implementation, using functions defined in the implementation's .h files.
+ * simdjson/generic/dependencies.h: dependencies on common, non-implementation-specific simdjson classes. This will be included before including amalgamated.h. + * simdjson/generic/amalgamated.h: all generic ondemand classes for an implementation. + * simdjson/ondemand.h: the `simdjson::ondemand` namespace. Includes all public ondemand classes. + * simdjson/builtin.h: the `simdjson::builtin` namespace. Aliased to the most universal implementation available. + * simdjson/builtin/ondemand.h: the `simdjson::builtin::ondemand` namespace. + * simdjson/arm64|fallback|haswell|icelake|ppc64|westmere/ondemand.h: the `simdjson::<implementation>::ondemand` namespace. On-Demand compiled for the specific implementation. + * simdjson/generic/ondemand/*.h: individual On-Demand classes, generically written. + * simdjson/generic/ondemand/dependencies.h: dependencies on common, non-implementation-specific simdjson classes. This will be included before including amalgamated.h. + * simdjson/generic/ondemand/amalgamated.h: all generic ondemand classes for an implementation. +* **src:** The source files for non-inlined functionality (e.g. the architecture-specific parser + implementations). + * simdjson.cpp: A "main source" that includes all implementation files from src/. This is + equivalent to the distributed simdjson.cpp. + * *.cpp: other misc. implementations, such as `simdjson::implementation` and the minifier. + * arm64|fallback|haswell|icelake|ppc64|westmere.cpp: Architecture-specific parser implementations. + * generic/*.h: `simdjson::<implementation>` namespace. Generic implementation of the parser, particularly the `dom_parser_implementation`. + * generic/stage1/*.h: `simdjson::<implementation>::stage1` namespace. Generic implementation of the simd-heavy tokenizer/indexer pass of the simdjson parser. Used for the On-Demand interface. + * generic/stage2/*.h: `simdjson::<implementation>::stage2` namespace. Generic implementation of the tape creator, which consumes the index from stage 1 and actually parses numbers and strings and such.
Used for the DOM interface. + +Other important files and directories: +* **.drone.yml:** Definitions for Drone CI. +* **.appveyor.yml:** Definitions for Appveyor CI (Windows). +* **.circleci:** Definitions for Circle CI. +* **.github/workflows:** Definitions for GitHub Actions (CI). +* **singleheader:** Contains generated `simdjson.h` and `simdjson.cpp` that we release. The files `singleheader/simdjson.h` and `singleheader/simdjson.cpp` should never be edited by hand. +* **singleheader/amalgamate.py:** Generates `singleheader/simdjson.h` and `singleheader/simdjson.cpp` for release (python script). +* **benchmark:** This is where we do benchmarking. Benchmarking is core to every change we make; the + cardinal rule is don't regress performance without knowing exactly why, and what you're trading + for it. Many of our benchmarks are microbenchmarks. We are effectively doing controlled scientific experiments for the purpose of understanding what affects our performance. So we simplify as much as possible. We try to avoid irrelevant factors such as page faults, interrupts, unnecessary system calls. We recommend checking the performance as follows: + ```bash + mkdir build + cd build + cmake -D SIMDJSON_DEVELOPER_MODE=ON .. + cmake --build . --config Release + benchmark/dom/parse ../jsonexamples/twitter.json + ``` + The last line becomes `./benchmark/Release/parse.exe ../jsonexamples/twitter.json` under Windows. You may also use Google Benchmark: + ```bash + mkdir build + cd build + cmake -D SIMDJSON_DEVELOPER_MODE=ON .. + cmake --build . --target bench_parse_call --config Release + ./benchmark/bench_parse_call + ``` + The last line becomes `./benchmark/Release/bench_parse_call.exe` under Windows. Under Windows, you can also build with the clang compiler by adding `-T ClangCL` to the call to `cmake ..`: `cmake -T ClangCL ..`. +* **fuzz:** The source for fuzz testing.
This lets us explore important edge and middle cases + automatically, and is run in CI. +* **jsonchecker:** A set of JSON files used to check different functionality of the parser. + * **pass*.json:** Files that should pass validation. + * **fail*.json:** Files that should fail validation. + * **jsonchecker/minefield/y_*.json:** Files that should pass validation. + * **jsonchecker/minefield/n_*.json:** Files that should fail validation. +* **jsonexamples:** A wide spread of useful, real-world JSON files with different characteristics + and sizes. +* **test:** The tests are here. basictests.cpp and errortests.cpp are the primary ones. +* **tools:** Source for executables that can be distributed with simdjson. Some examples: + * `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output. + * `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`. + * `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters. + * `jsonpointer mydoc.json ... ` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document. + + +> **Don't modify the files in singleheader/ directly; these are automatically generated.** + + +While simdjson distributes just two files from the singleheader/ directory, we *maintain* the code in +multiple files under include/ and src/. The files include/simdjson.h and src/simdjson.cpp are the "spine" for +these, and you can include them as if they were the corresponding singleheader/ files. 
+ + + +Runtime Dispatching +-------------------- + +A key feature of simdjson is the ability to compile different processing kernels, optimized for specific instruction sets, and to select +the most appropriate kernel at runtime. This ensures that users get the very best performance while still enabling simdjson to run everywhere. +This technique is frequently called runtime dispatching. The simdjson achieves runtime dispatching entirely in C++: we do not assume +that the user is building the code using CMake, for example. + +To make runtime dispatching work, it is critical that the code be compiled for the lowest supported processor. In particular, you should +not use flags such as -mavx2, /arch:AVX2 and so forth while compiling simdjson. When you do so, you allow the compiler to use advanced +instructions. In turn, these advanced instructions present in the code may cause a runtime failure if the runtime processor does not +support them. Even a simple loop, compiled with these flags, might generate binary code that only run on advanced processors. + +So we compile simdjson for a generic processor. Our users should do the same if they want simdjson's runtime dispatch to work. It is important +to understand that if runtime dispatching does not work, then simdjson will cause crashes on older processors. Of course, if a user chooses +to compile their code for a specific instruction set (e.g., AVX2), they are responsible for the failures if they later run their code +on a processor that does not support AVX2. Yet, if we were to entice these users to do so, we would share the blame: thus we carefully instruct +users to compile their code in a generic way without doing anything to enable advanced instructions. + + +We only use runtime dispatching on x64 (AMD/Intel) platforms, at the moment. On ARM processors, we would need a standard way to query, at runtime, +the processor for its supported features. We do not know how to do so on ARM systems in general. 
Thankfully it is not yet a concern: 64-bit ARM +processors are fairly uniform as far as the instruction sets they support. + + +In all cases, simdjson uses advanced instructions by relying on "intrinsic functions": we do not write assembly code. The intrinsic functions +are special functions that the compiler might recognize and translate into fast code. To make runtime dispatching work, we rely on the fact that +the header providing these instructions +(intrin.h under Visual Studio, x86intrin.h elsewhere) defines all of the intrinsic functions, including those that are not supported by the +processor. + +At this point, we are required to use one of two main strategies. + +1. On POSIX systems, the main compilers (LLVM clang, GNU gcc) allow us to use any intrinsic function after including the header, but they fail to inline the resulting instructions if the target processor does not support them. Because we compile for a generic processor, we would not be able to use most intrinsic functions. Thankfully, more recent versions of these compilers allow us to flag a region of code with a specific target, so that we can compile only some of the code with support for advanced instructions. Thus in our C++, one might notice macros like `TARGET_HASWELL`. It is then our responsibility, at runtime, to only run the regions of code (that we call kernels) matching the properties of the runtime processor. The benefit of this approach is that the compiler not only lets us use intrinsic functions, but it can also optimize the rest of the code in the kernel with advanced instructions we enabled.
The downside of the Visual Studio approach is that the compiler is not allowed to use advanced instructions other than those we specify. In principle, this means that Visual Studio has weaker optimization opportunities. + + + +We also handle the special case where a user is compiling using LLVM clang under Windows, [using the Visual Studio toolchain](https://devblogs.microsoft.com/cppblog/clang-llvm-support-in-visual-studio/). If you compile with LLVM clang under Visual Studio, then the header files (intrin.h or x86intrin.h) no longer provide the intrinsic functions that are unsupported by the processor. This appears to be deliberate on the part of the LLVM engineers. With a few lines of code, we handle this scenario just like LLVM clang under a POSIX system, by forcing the inclusion of the specific headers, and rolling our own intrinsic functions as needed. + + + + + +Regenerating Single-Header Files +--------------------------------------- + +The simdjson.h and simdjson.cpp files in the singleheader directory are not always up-to-date with the rest of the code; they are only ever +systematically regenerated on releases. To ensure you have the latest code, you can regenerate them by running this at the top level: + +```bash +mkdir build +cd build +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . # needed, because currently dependencies do not work fully for the amalgamate target +cmake --build . --target amalgamate +``` + +You need to have python3 installed on your system. + +The amalgamator script `amalgamate.py` generates singleheader/simdjson.h by +reading through include/simdjson.h, copy/pasting each header file into the amalgamated file at the +point it gets included (but only once per header). singleheader/simdjson.cpp is generated from +src/simdjson.cpp the same way, except files under generic/ may be included and copy/pasted multiple +times.
+ +## Usage (CMake on 64-bit platforms like Linux, FreeBSD or macOS) + +Requirements: In addition to git, we require a recent version of CMake as well as bash. + +1. On macOS, the easiest way to install cmake might be to use [brew](https://brew.sh) and then type +``` +brew install cmake +``` +2. Under Linux, you might be able to install CMake as follows: +``` +apt-get update -qq +apt-get install -y cmake +``` +3. On FreeBSD, you might be able to install bash and CMake as follows: +``` +pkg update -f +pkg install bash +pkg install cmake +``` + +You need a recent compiler like clang or gcc. We recommend at least GNU GCC/G++ 7 or LLVM clang 6. + + +Building: While in the project repository, do the following: + +``` +mkdir build +cd build +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . +ctest +``` + +CMake will build a library. By default, it builds a static library (e.g., libsimdjson.a on Linux). + +You can build a shared library: + +``` +mkdir buildshared +cd buildshared +cmake -D BUILD_SHARED_LIBS=ON -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . +ctest +``` + +In some cases, you may want to specify your compiler, especially if the default compiler on your system is too old. You need to tell cmake which compiler you wish to use by setting the CC and CXX variables. Under bash, you can do so with commands such as `export CC=gcc-7` and `export CXX=g++-7`. You can also do it as part of the `cmake` command: `cmake -DCMAKE_CXX_COMPILER=g++ ..`. You may proceed as follows: + +``` +brew install gcc@8 +mkdir build +cd build +export CXX=g++-8 CC=gcc-8 +cmake -D SIMDJSON_DEVELOPER_MODE=ON .. +cmake --build . +ctest +``` + +If your compiler does not default on C++11 support or better you may get failing tests. If so, you may be able to exclude the failing tests by replacing `ctest` with `ctest -E "^quickstart$"`. 
+ +Note that the name of directory (`build`) is arbitrary, you can name it as you want (e.g., `buildgcc`) and you can have as many different such directories as you would like (one per configuration). + +## Usage (CMake on 64-bit Windows using Visual Studio 2019 or better) + +Recent versions of Visual Studio support CMake natively, [please refer to the Visual Studio documentation](https://learn.microsoft.com/en-us/cpp/build/cmake-projects-in-visual-studio?view=msvc-170). + +We assume you have a common 64-bit Windows PC with at least Visual Studio 2019. + +- Grab the simdjson code from GitHub, e.g., by cloning it using [GitHub Desktop](https://desktop.github.com/). +- Install [CMake](https://cmake.org/download/). When you install it, make sure to ask that `cmake` be made available from the command line. Please choose a recent version of cmake. +- Create a subdirectory within simdjson, such as `build`. +- Using a shell, go to this newly created directory. You can start a shell directly from GitHub Desktop (Repository > Open in Command Prompt). +- Type `cmake ..` in the shell while in the `build` repository. +- This last command (`cmake ...`) created a Visual Studio solution file in the newly created directory (e.g., `simdjson.sln`). Open this file in Visual Studio. You should now be able to build the project and run the tests. For example, in the `Solution Explorer` window (available from the `View` menu), right-click `ALL_BUILD` and select `Build`. To test the code, still in the `Solution Explorer` window, select `RUN_TESTS` and select `Build`. + + +Though having Visual Studio installed is necessary, one can build simdjson using only cmake commands: + +- `mkdir build` +- `cd build` +- `cmake ..` +- `cmake --build . 
--config Release` + + +Furthermore, if you have installed LLVM clang on Windows, for example as a component of Visual Studio 2019, you can configure and build simdjson using LLVM clang on Windows using cmake: + +- `mkdir build` +- `cd build` +- `cmake -T ClangCL ..` +- `cmake --build . --config Release` + +## Various References + +- [How to implement atoi using SIMD?](https://stackoverflow.com/questions/35127060/how-to-implement-atoi-using-simd) +- [Parsing JSON is a Minefield 💣](http://seriot.ch/parsing_json.php) +- https://tools.ietf.org/html/rfc7159 +- http://rapidjson.org/md_doc_sax.html +- https://github.com/Geal/parser_benchmarks/tree/master/json +- Gron: A command line tool that makes JSON greppable https://news.ycombinator.com/item?id=16727665 +- GoogleGson https://github.com/google/gson +- Jackson https://github.com/FasterXML/jackson +- https://www.yelp.com/dataset_challenge +- RapidJSON. http://rapidjson.org/ + +Inspiring links: + +- https://auth0.com/blog/beating-json-performance-with-protobuf/ +- https://gist.github.com/shijuvar/25ad7de9505232c87034b8359543404a +- https://github.com/frankmcsherry/blog/blob/master/posts/2018-02-11.md diff --git a/contrib/libs/simdjson/LICENSE b/contrib/libs/simdjson/LICENSE new file mode 100644 index 000000000000..71f65b598d90 --- /dev/null +++ b/contrib/libs/simdjson/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2018-2023 The simdjson authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/contrib/libs/simdjson/README.md b/contrib/libs/simdjson/README.md new file mode 100644 index 000000000000..b92c22bb7553 --- /dev/null +++ b/contrib/libs/simdjson/README.md @@ -0,0 +1,227 @@ + +[![Ubuntu 20.04 CI](https://github.com/simdjson/simdjson/workflows/Ubuntu%2020.04%20CI%20(GCC%209)/badge.svg)](https://simdjson.org/plots.html) +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/simdjson.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:simdjson) +[![][license img]][license] + +[![Doxygen Documentation](https://img.shields.io/badge/docs-doxygen-green.svg)](https://simdjson.github.io/simdjson/) + +simdjson : Parsing gigabytes of JSON per second +=============================================== + + +JSON is everywhere on the Internet. Servers spend a *lot* of time parsing it. We need a fresh +approach. The simdjson library uses commonly available SIMD instructions and microparallel algorithms +to parse JSON 4x faster than RapidJSON and 25x faster than JSON for Modern C++. + +* **Fast:** Over 4x faster than commonly used production-grade JSON parsers. +* **Record Breaking Features:** Minify JSON at 6 GB/s, validate UTF-8 at 13 GB/s, NDJSON at 3.5 GB/s. +* **Easy:** First-class, easy to use and carefully documented APIs. +* **Strict:** Full JSON and UTF-8 validation, lossless parsing. Performance with no compromises. 
+* **Automatic:** Selects a CPU-tailored parser at runtime. No configuration needed. +* **Reliable:** From memory allocation to error handling, simdjson's design avoids surprises. +* **Peer Reviewed:** Our research appears in venues like VLDB Journal, Software: Practice and Experience. + +This library is part of the [Awesome Modern C++](https://awesomecpp.com) list. + +Table of Contents +----------------- + +* [Real-world usage](#real-world-usage) +* [Quick Start](#quick-start) +* [Documentation](#documentation) +* [Godbolt](#godbolt) +* [Performance results](#performance-results) +* [Bindings and Ports of simdjson](#bindings-and-ports-of-simdjson) +* [About simdjson](#about-simdjson) +* [Funding](#funding) +* [Contributing to simdjson](#contributing-to-simdjson) +* [License](#license) + + +Real-world usage +---------------- + +- [Node.js](https://nodejs.org/) +- [ClickHouse](https://github.com/ClickHouse/ClickHouse) +- [Meta Velox](https://velox-lib.io) +- [Google Pax](https://github.com/google/paxml) +- [milvus](https://github.com/milvus-io/milvus) +- [QuestDB](https://questdb.io/blog/questdb-release-8-0-3/) +- [Clang Build Analyzer](https://github.com/aras-p/ClangBuildAnalyzer) +- [Shopify HeapProfiler](https://github.com/Shopify/heap-profiler) +- [StarRocks](https://github.com/StarRocks/starrocks) +- [Microsoft FishStore](https://github.com/microsoft/FishStore) +- [Intel PCM](https://github.com/intel/pcm) +- [WatermelonDB](https://github.com/Nozbe/WatermelonDB) +- [Apache Doris](https://github.com/apache/doris) +- [Dgraph](https://github.com/dgraph-io/dgraph) +- [UJRPC](https://github.com/unum-cloud/ujrpc) +- [fastgltf](https://github.com/spnda/fastgltf) +- [vast](https://github.com/tenzir/vast) +- [ada-url](https://github.com/ada-url/ada) +- [fastgron](https://github.com/adamritter/fastgron) +- [WasmEdge](https://wasmedge.org) + +If you are planning to use simdjson in a product, please work from one of our releases. 
+ +Quick Start +----------- + +The simdjson library is easily consumable with a single .h and .cpp file. + +0. Prerequisites: `g++` (version 7 or better) or `clang++` (version 6 or better), and a 64-bit + system with a command-line shell (e.g., Linux, macOS, freeBSD). We also support programming + environments like Visual Studio and Xcode, but different steps are needed. Users of clang++ may need to specify the C++ version (e.g., `c++ -std=c++17`) since clang++ tends to default on C++98. +1. Pull [simdjson.h](singleheader/simdjson.h) and [simdjson.cpp](singleheader/simdjson.cpp) into a + directory, along with the sample file [twitter.json](jsonexamples/twitter.json). You can download them with the `wget` utility: + + ``` + wget https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.h https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.cpp https://raw.githubusercontent.com/simdjson/simdjson/master/jsonexamples/twitter.json + ``` +2. Create `quickstart.cpp`: + +```c++ +#include +#include "simdjson.h" +using namespace simdjson; +int main(void) { + ondemand::parser parser; + padded_string json = padded_string::load("twitter.json"); + ondemand::document tweets = parser.iterate(json); + std::cout << uint64_t(tweets["search_metadata"]["count"]) << " results." << std::endl; +} +``` +3. `c++ -o quickstart quickstart.cpp simdjson.cpp` +4. `./quickstart` + + ``` + 100 results. + ``` + + +Documentation +------------- + +Usage documentation is available: + +* [Basics](doc/basics.md) is an overview of how to use simdjson and its APIs. +* [Performance](doc/performance.md) shows some more advanced scenarios and how to tune for them. +* [Implementation Selection](doc/implementation-selection.md) describes runtime CPU detection and + how you can work with it. +* [API](https://simdjson.github.io/simdjson/) contains the automatically generated API documentation. 
+ +Godbolt +------------- + +Some users may want to browse code along with the compiled assembly. You want to check out the following lists of examples: +* [simdjson examples with errors handled through exceptions](https://godbolt.org/z/7G5qE4sr9) +* [simdjson examples with errors without exceptions](https://godbolt.org/z/e9dWb9E4v) + +Performance results +------------------- + +The simdjson library uses three-quarters less instructions than state-of-the-art parser [RapidJSON](https://rapidjson.org). To our knowledge, simdjson is the first fully-validating JSON parser +to run at [gigabytes per second](https://en.wikipedia.org/wiki/Gigabyte) (GB/s) on commodity processors. It can parse millions of JSON documents per second on a single core. + +The following figure represents parsing speed in GB/s for parsing various files +on an Intel Skylake processor (3.4 GHz) using the GNU GCC 10 compiler (with the -O3 flag). +We compare against the best and fastest C++ libraries on benchmarks that load and process the data. +The simdjson library offers full unicode ([UTF-8](https://en.wikipedia.org/wiki/UTF-8)) validation and exact +number parsing. + + + +The simdjson library offers high speed whether it processes tiny files (e.g., 300 bytes) +or larger files (e.g., 3MB). The following plot presents parsing +speed for [synthetic files over various sizes generated with a script](https://github.com/simdjson/simdjson_experiments_vldb2019/blob/master/experiments/growing/gen.py) on a 3.4 GHz Skylake processor (GNU GCC 9, -O3). + + + +[All our experiments are reproducible](https://github.com/simdjson/simdjson_experiments_vldb2019). + + +For NDJSON files, we can exceed 3 GB/s with [our multithreaded parsing functions](https://github.com/simdjson/simdjson/blob/master/doc/parse_many.md). 
+ + + + +Bindings and Ports of simdjson +------------------------------ + +We distinguish between "bindings" (which just wrap the C++ code) and a port to another programming language (which reimplements everything). + +- [ZippyJSON](https://github.com/michaeleisel/zippyjson): Swift bindings for the simdjson project. +- [libpy_simdjson](https://github.com/gerrymanoim/libpy_simdjson/): high-speed Python bindings for simdjson using [libpy](https://github.com/quantopian/libpy). +- [pysimdjson](https://github.com/TkTech/pysimdjson): Python bindings for the simdjson project. +- [cysimdjson](https://github.com/TeskaLabs/cysimdjson): high-speed Python bindings for the simdjson project. +- [simdjson-rs](https://github.com/simd-lite): Rust port. +- [simdjson-rust](https://github.com/SunDoge/simdjson-rust): Rust wrapper (bindings). +- [SimdJsonSharp](https://github.com/EgorBo/SimdJsonSharp): C# version for .NET Core (bindings and full port). +- [simdjson_nodejs](https://github.com/luizperes/simdjson_nodejs): Node.js bindings for the simdjson project. +- [simdjson_php](https://github.com/crazyxman/simdjson_php): PHP bindings for the simdjson project. +- [simdjson_ruby](https://github.com/saka1/simdjson_ruby): Ruby bindings for the simdjson project. +- [fast_jsonparser](https://github.com/anilmaurya/fast_jsonparser): Ruby bindings for the simdjson project. +- [simdjson-go](https://github.com/minio/simdjson-go): Go port using Golang assembly. +- [rcppsimdjson](https://github.com/eddelbuettel/rcppsimdjson): R bindings. +- [simdjson_erlang](https://github.com/ChomperT/simdjson_erlang): erlang bindings. +- [simdjsone](https://github.com/saleyn/simdjsone): erlang bindings. +- [lua-simdjson](https://github.com/FourierTransformer/lua-simdjson): lua bindings. +- [hermes-json](https://hackage.haskell.org/package/hermes-json): haskell bindings. +- [simdjzon](https://github.com/travisstaloch/simdjzon): zig port. +- [JSON-Simd](https://github.com/rawleyfowler/JSON-simd): Raku bindings. 
+- [JSON::SIMD](https://metacpan.org/pod/JSON::SIMD): Perl bindings; fully-featured JSON module that uses simdjson for decoding. +- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim JSON parser based on simdjson bindings. +- [simdjson-java](https://github.com/simdjson/simdjson-java): Java port. + +About simdjson +-------------- + +The simdjson library takes advantage of modern microarchitectures, parallelizing with SIMD vector +instructions, reducing branch misprediction, and reducing data dependency to take advantage of each +CPU's multiple execution cores. + +Our default front-end is called On-Demand, and we wrote a paper about it: + +- John Keiser, Daniel Lemire, [On-Demand JSON: A Better Way to Parse Documents?](http://arxiv.org/abs/2312.17149), Software: Practice and Experience 54 (6), 2024. + +Some people [enjoy reading the first (2019) simdjson paper](https://arxiv.org/abs/1902.08318): A description of the design +and implementation of simdjson is in our research article: +- Geoff Langdale, Daniel Lemire, [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318), VLDB Journal 28 (6), 2019. + +We have an in-depth paper focused on the UTF-8 validation: + +- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice & Experience 51 (5), 2021. + +We also have an informal [blog post providing some background and context](https://branchfree.org/2019/02/25/paper-parsing-gigabytes-of-json-per-second/). + +For the video inclined,
+[![simdjson at QCon San Francisco 2019](http://img.youtube.com/vi/wlvKAT7SZIQ/0.jpg)](http://www.youtube.com/watch?v=wlvKAT7SZIQ)
+(It was the best voted talk, we're kinda proud of it.) + +Funding +------- + +The work is supported by the Natural Sciences and Engineering Research Council of Canada under grants +RGPIN-2017-03910 and RGPIN-2024-03787. + +[license]: LICENSE +[license img]: https://img.shields.io/badge/License-Apache%202-blue.svg + +Contributing to simdjson +------------------------ + +Head over to [CONTRIBUTING.md](CONTRIBUTING.md) for information on contributing to simdjson, and +[HACKING.md](HACKING.md) for information on source, building, and architecture/design. + +License +------- + +This code is made available under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0.html). + +Under Windows, we build some tools using the windows/dirent_portable.h file (which is outside our library code): it is under the liberal (business-friendly) MIT license. + +For compilers that do not support [C++17](https://en.wikipedia.org/wiki/C%2B%2B17), we bundle the string-view library which is published under the [Boost license](http://www.boost.org/LICENSE_1_0.txt). Like the Apache license, the Boost license is a permissive license allowing commercial redistribution. + +For efficient number serialization, we bundle Florian Loitsch's implementation of the Grisu2 algorithm for binary to decimal floating-point numbers. The implementation was slightly modified by JSON for Modern C++ library. Both Florian Loitsch's implementation and JSON for Modern C++ are provided under the MIT license. + +For runtime dispatching, we use some code from the PyTorch project licensed under 3-clause BSD. 
diff --git a/contrib/libs/simdjson/SECURITY.md b/contrib/libs/simdjson/SECURITY.md new file mode 100644 index 000000000000..87c4c9b35e31 --- /dev/null +++ b/contrib/libs/simdjson/SECURITY.md @@ -0,0 +1,7 @@ +# Security Policy + +## Reporting a Vulnerability + +Please use the following contact information for reporting a vulnerability: + +- [Daniel Lemire](https://github.com/lemire) - daniel@lemire.me diff --git a/contrib/libs/simdjson/include/simdjson.h b/contrib/libs/simdjson/include/simdjson.h new file mode 100644 index 000000000000..f77ab12b3ec5 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson.h @@ -0,0 +1,57 @@ +#ifndef SIMDJSON_H +#define SIMDJSON_H + +/** + * @mainpage + * + * Check the [README.md](https://github.com/simdjson/simdjson/blob/master/README.md#simdjson--parsing-gigabytes-of-json-per-second). + * + * Sample code. See https://github.com/simdjson/simdjson/blob/master/doc/basics.md for more examples. + + #include "simdjson.h" + + int main(void) { + // load from `twitter.json` file: + simdjson::dom::parser parser; + simdjson::dom::element tweets = parser.load("twitter.json"); + std::cout << tweets["search_metadata"]["count"] << " results." << std::endl; + + // Parse and iterate through an array of objects + auto abstract_json = R"( [ + { "12345" : {"a":12.34, "b":56.78, "c": 9998877} }, + { "12545" : {"a":11.44, "b":12.78, "c": 11111111} } + ] )"_padded; + + for (simdjson::dom::object obj : parser.parse(abstract_json)) { + for(const auto key_value : obj) { + cout << "key: " << key_value.key << " : "; + simdjson::dom::object innerobj = key_value.value; + cout << "a: " << double(innerobj["a"]) << ", "; + cout << "b: " << double(innerobj["b"]) << ", "; + cout << "c: " << int64_t(innerobj["c"]) << endl; + } + } + } + */ + +#include "simdjson/common_defs.h" + +// This provides the public API for simdjson. 
+// DOM and ondemand are amalgamated separately, in simdjson.h +#include "simdjson/simdjson_version.h" + +#include "simdjson/base.h" + +#include "simdjson/error.h" +#include "simdjson/error-inl.h" +#include "simdjson/implementation.h" +#include "simdjson/minify.h" +#include "simdjson/padded_string.h" +#include "simdjson/padded_string-inl.h" +#include "simdjson/padded_string_view.h" +#include "simdjson/padded_string_view-inl.h" + +#include "simdjson/dom.h" +#include "simdjson/ondemand.h" + +#endif // SIMDJSON_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64.h b/contrib/libs/simdjson/include/simdjson/arm64.h new file mode 100644 index 000000000000..1493c3562a5d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_ARM64_H +#define SIMDJSON_ARM64_H + +#include "simdjson/arm64/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/arm64/end.h" + +#endif // SIMDJSON_ARM64_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/arm64/base.h b/contrib/libs/simdjson/include/simdjson/arm64/base.h new file mode 100644 index 000000000000..8f23d18d80e4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/base.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_ARM64_BASE_H +#define SIMDJSON_ARM64_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Implementation for NEON (ARMv8). 
+ */ +namespace arm64 { + +class implementation; + +namespace { +namespace simd { +template struct simd8; +template struct simd8x64; +} // namespace simd +} // unnamed namespace + +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/begin.h b/contrib/libs/simdjson/include/simdjson/arm64/begin.h new file mode 100644 index 000000000000..ee48cec05150 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/begin.h @@ -0,0 +1,10 @@ +#define SIMDJSON_IMPLEMENTATION arm64 +#include "simdjson/arm64/base.h" +#include "simdjson/arm64/intrinsics.h" +#include "simdjson/arm64/bitmanipulation.h" +#include "simdjson/arm64/bitmask.h" +#include "simdjson/arm64/numberparsing_defs.h" +#include "simdjson/arm64/simd.h" +#include "simdjson/arm64/stringparsing_defs.h" + +#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1 \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/arm64/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/arm64/bitmanipulation.h new file mode 100644 index 000000000000..019869b1b0b2 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/bitmanipulation.h @@ -0,0 +1,106 @@ +#ifndef SIMDJSON_ARM64_BITMANIPULATION_H +#define SIMDJSON_ARM64_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#include "simdjson/arm64/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace arm64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int count_ones(uint64_t input_num) { + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); +} + + +#if defined(__GNUC__) // catches clang and gcc +/** + * ARM has a fast 64-bit "bit reversal function" that is handy. However, + * it is not generally available as an intrinsic function under Visual + * Studio (though this might be changing). Even under clang/gcc, we + * apparently need to invoke inline assembly. + */ +/* + * We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that + * work well with bit reversal may use it. 
+ */ +#define SIMDJSON_PREFER_REVERSE_BITS 1 + +/* reverse the bits */ +simdjson_inline uint64_t reverse_bits(uint64_t input_num) { + uint64_t rev_bits; + __asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num)); + return rev_bits; +} + +/** + * Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes, + * then this will set to zero the leading bit. It is possible for leading_zeroes to be + * greating or equal to 63 in which case we trigger undefined behavior, but the output + * of such undefined behavior is never used. + **/ +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) { + return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes); +} + +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/bitmask.h b/contrib/libs/simdjson/include/simdjson/arm64/bitmask.h new file mode 100644 index 000000000000..5d6121bcc7b1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/bitmask.h @@ -0,0 +1,44 @@ +#ifndef SIMDJSON_ARM64_BITMASK_H +#define SIMDJSON_ARM64_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace arm64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + ///////////// + // We could do this with PMULL, but it is apparently slow. 
+ // + //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension + //return vmull_p64(-1ULL, bitmask); + //#else + // Analysis by @sebpop: + // When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out + // in between other vector code, so effectively the extra cycles of the sequence do not matter + // because the GPR units are idle otherwise and the critical path is on the FP side. + // Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 ) + // and FP->GPR (2 cycles on N1 and 5 cycles on A72.) + /////////// + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/arm64/end.h b/contrib/libs/simdjson/include/simdjson/arm64/end.h new file mode 100644 index 000000000000..b92378df97fd --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/end.h @@ -0,0 +1,6 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/arm64/implementation.h b/contrib/libs/simdjson/include/simdjson/arm64/implementation.h new file mode 100644 index 000000000000..c9b7dd753507 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/implementation.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_ARM64_IMPLEMENTATION_H +#define SIMDJSON_ARM64_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace arm64 { + +/** + * @private + */ +class implementation final : public 
simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/intrinsics.h b/contrib/libs/simdjson/include/simdjson/arm64/intrinsics.h new file mode 100644 index 000000000000..049d99861020 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/intrinsics.h @@ -0,0 +1,14 @@ +#ifndef SIMDJSON_ARM64_INTRINSICS_H +#define SIMDJSON_ARM64_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This should be the correct header whether +// you use visual studio or other compilers. 
+#include + +static_assert(sizeof(uint8x16_t) <= simdjson::SIMDJSON_PADDING, "insufficient padding for arm64"); + +#endif // SIMDJSON_ARM64_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/arm64/numberparsing_defs.h new file mode 100644 index 000000000000..f5e85b747790 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/numberparsing_defs.h @@ -0,0 +1,56 @@ +#ifndef SIMDJSON_ARM64_NUMBERPARSING_DEFS_H +#define SIMDJSON_ARM64_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#include "simdjson/arm64/intrinsics.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +#if SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64 +// __umulh requires intrin.h +#include +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64 + +namespace simdjson { +namespace arm64 { +namespace numberparsing { + +// we don't have SSE, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || 
SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace arm64 +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_ARM64_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/ondemand.h b/contrib/libs/simdjson/include/simdjson/arm64/ondemand.h new file mode 100644 index 000000000000..67134394675e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_ARM64_ONDEMAND_H +#define SIMDJSON_ARM64_ONDEMAND_H + +#include "simdjson/arm64/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/arm64/end.h" + +#endif // SIMDJSON_ARM64_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/simd.h b/contrib/libs/simdjson/include/simdjson/arm64/simd.h new file mode 100644 index 000000000000..3b0fa844f496 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/simd.h @@ -0,0 +1,497 @@ +#ifndef SIMDJSON_ARM64_SIMD_H +#define SIMDJSON_ARM64_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#include "simdjson/arm64/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace arm64 { +namespace { +namespace simd { + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +namespace { +// Start of private section with Visual Studio workaround + + +#ifndef simdjson_make_uint8x16_t +#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \ + x13, x14, x15, x16) \ + ([=]() { \ + uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11, x12, x13, x14, x15, x16}; \ + return vld1q_u8(array); \ + }()) +#endif +#ifndef simdjson_make_int8x16_t +#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \ + x13, 
x14, x15, x16) \ + ([=]() { \ + int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11, x12, x13, x14, x15, x16}; \ + return vld1q_s8(array); \ + }()) +#endif + +#ifndef simdjson_make_uint8x8_t +#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1_u8(array); \ + }()) +#endif +#ifndef simdjson_make_int8x8_t +#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1_s8(array); \ + }()) +#endif +#ifndef simdjson_make_uint16x8_t +#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1q_u16(array); \ + }()) +#endif +#ifndef simdjson_make_int16x8_t +#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \ + ([=]() { \ + int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \ + return vld1q_s16(array); \ + }()) +#endif + +// End of private section with Visual Studio workaround +} // namespace +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. 
+ // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdjson_inline operator const uint8x16_t&() const { return this->value; } + simdjson_inline operator uint8x16_t&() { return this->value; } + + // Bit operations + simdjson_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + simdjson_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + simdjson_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + simdjson_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdjson_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return vceqq_u8(lhs, rhs); } + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdjson_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } + + simdjson_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdjson_inline simd8(bool _value) : simd8(splat(_value)) {} + + // 
We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). + simdjson_inline uint32_t to_bitmask() const { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; } + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); } + static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); } + static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); } + + simdjson_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + simdjson_inline simd8() : simd8(zero()) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(simdjson_make_uint8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t 
v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-specific operations + simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); } + simdjson_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return vminq_u8(*this, other); } + simdjson_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + simdjson_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + 
simdjson_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdjson_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdjson_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } + + // Bit-specific operations + simdjson_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + simdjson_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + simdjson_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint16_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
+ template + simdjson_inline void compress(uint16_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]}; + uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64); + // we increment by 0x08 the second half of the mask +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08); +#else + uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; +#endif + shufmask = vaddq_u8(shufmask, inc); + // this is the version "nearly pruned" + uint8x16_t pruned = vqtbl1q_u8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + uint8x16_t compactmask = vld1q_u8(reinterpret_cast(pshufb_combine_table + pop1 * 8)); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8(reinterpret_cast(output), answer); + } + + // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a + // bitset) to output1, then those corresponding to a 0 in the high half to output2. 
+ template + simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const { + using internal::thintable_epi8; + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]); + uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]); + // we increment by 0x08 the second half of the mask +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08); +#else + uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; +#endif + compactmask2 = vadd_u8(compactmask2, inc); + // store each result (with the second store possibly overlapping the first) + vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1)); + vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2)); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdjson_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_u8(*this, simd8(original)); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static simdjson_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static simdjson_inline simd8 zero() { return vdupq_n_s8(0); } + static simdjson_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + + // Conversion from/to SIMD register + simdjson_inline simd8(const int8x16_t _value) : value{_value} {} + simdjson_inline operator const int8x16_t&() const { return 
this->value; } + simdjson_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + simdjson_inline simd8() : simd8(zero()) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(simdjson_make_int8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); } + + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. 
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdjson_inline explicit operator simd8() const { return vreinterpretq_u8_s8(this->value); } + + // Math + simdjson_inline simd8 operator+(const simd8 other) const { return vaddq_s8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return vsubq_s8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return vminq_s8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return vcltq_s8(*this, other); } + simdjson_inline simd8 operator==(const simd8 other) const { return vceqq_s8(*this, other); } + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup assuming no value is larger than 16 + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdjson_inline simd8 apply_lookup_16_to(const simd8 original) 
{ + return vqtbl1q_s8(*this, simd8(original)); + } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + this->chunks[2].store(ptr+sizeof(simd8)*2); + this->chunks[3].store(ptr+sizeof(simd8)*3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); + // compute the prefix sum of the popcounts of each byte + uint64_t offsets = popcounts * 0x0101010101010101; + this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]); + this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]); + this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]); + this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]); + return offsets >> 56; + } + + simdjson_inline uint64_t to_bitmask() const { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t 
bit_mask = simdjson_make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); + uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_SIMD_H diff --git a/contrib/libs/simdjson/include/simdjson/arm64/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/arm64/stringparsing_defs.h new file mode 100644 index 000000000000..30d02faff3f9 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/arm64/stringparsing_defs.h @@ -0,0 +1,53 @@ +#ifndef SIMDJSON_ARM64_STRINGPARSING_DEFS_H +#define SIMDJSON_ARM64_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/arm64/base.h" +#include "simdjson/arm64/simd.h" +#include "simdjson/arm64/bitmanipulation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace arm64 { +namespace { + +using namespace simd; + +// Holds 
backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); + + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. 
+ uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/base.h b/contrib/libs/simdjson/include/simdjson/base.h new file mode 100644 index 000000000000..a73b84bd082e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/base.h @@ -0,0 +1,60 @@ +/** + * @file Base declarations for all simdjson headers + * @private + */ +#ifndef SIMDJSON_BASE_H +#define SIMDJSON_BASE_H + +#include "simdjson/common_defs.h" +#include "simdjson/compiler_check.h" +#include "simdjson/error.h" +#include "simdjson/portability.h" + +/** + * @brief The top level simdjson namespace, containing everything the library provides. + */ +namespace simdjson { + +SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS + +/** The maximum document size supported by simdjson. */ +constexpr size_t SIMDJSON_MAXSIZE_BYTES = 0xFFFFFFFF; + +/** + * The amount of padding needed in a buffer to parse JSON. + * + * The input buf should be readable up to buf + SIMDJSON_PADDING + * this is a stopgap; there should be a better description of the + * main loop and its behavior that abstracts over this + * See https://github.com/simdjson/simdjson/issues/174 + */ +constexpr size_t SIMDJSON_PADDING = 64; + +/** + * By default, simdjson supports this many nested objects and arrays. + * + * This is the default for parser::max_depth(). 
+ */ +constexpr size_t DEFAULT_MAX_DEPTH = 1024; + +SIMDJSON_POP_DISABLE_UNUSED_WARNINGS + +class implementation; +struct padded_string; +class padded_string_view; +enum class stage1_mode; + +namespace internal { + +template +class atomic_ptr; +class dom_parser_implementation; +class escape_json_string; +class tape_ref; +struct value128; +enum class tape_type; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/builtin.h b/contrib/libs/simdjson/include/simdjson/builtin.h new file mode 100644 index 000000000000..4788007f88c9 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/builtin.h @@ -0,0 +1,33 @@ +#ifndef SIMDJSON_BUILTIN_H +#define SIMDJSON_BUILTIN_H + +#include "simdjson/builtin/base.h" +#include "simdjson/builtin/implementation.h" + +#include "simdjson/generic/dependencies.h" + +#define SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_BUILTIN_IMPLEMENTATION_IS(arm64) +#include "simdjson/arm64.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(fallback) +#include "simdjson/fallback.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(haswell) +#include "simdjson/haswell.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(icelake) +#include "simdjson/icelake.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(ppc64) +#include "simdjson/ppc64.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(westmere) +#include "simdjson/westmere.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lsx) +#include "simdjson/lsx.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lasx) +#include "simdjson/lasx.h" +#else +#error Unknown SIMDJSON_BUILTIN_IMPLEMENTATION +#endif + +#undef SIMDJSON_CONDITIONAL_INCLUDE + +#endif // SIMDJSON_BUILTIN_H diff --git a/contrib/libs/simdjson/include/simdjson/builtin/base.h b/contrib/libs/simdjson/include/simdjson/builtin/base.h new file mode 100644 index 000000000000..ce1678013e48 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/builtin/base.h @@ -0,0 +1,41 @@ +#ifndef SIMDJSON_BUILTIN_BASE_H 
+#define SIMDJSON_BUILTIN_BASE_H + +#include "simdjson/base.h" +#include "simdjson/implementation_detection.h" + +namespace simdjson { +#if SIMDJSON_BUILTIN_IMPLEMENTATION_IS(arm64) + namespace arm64 {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(fallback) + namespace fallback {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(haswell) + namespace haswell {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(icelake) + namespace icelake {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(ppc64) + namespace ppc64 {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(westmere) + namespace westmere {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lsx) + namespace lsx {} +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lasx) + namespace lasx {} +#else +#error Unknown SIMDJSON_BUILTIN_IMPLEMENTATION +#endif + + /** + * Represents the best statically linked simdjson implementation that can be used by the compiling + * program. + * + * Detects what options the program is compiled against, and picks the minimum implementation that + * will work on any computer that can run the program. For example, if you compile with g++ + * -march=westmere, it will pick the westmere implementation. The haswell implementation will + * still be available, and can be selected at runtime, but the builtin implementation (and any + * code that uses it) will use westmere. 
+ */ + namespace builtin = SIMDJSON_BUILTIN_IMPLEMENTATION; +} // namespace simdjson + +#endif // SIMDJSON_BUILTIN_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/builtin/implementation.h b/contrib/libs/simdjson/include/simdjson/builtin/implementation.h new file mode 100644 index 000000000000..68a175d07ae3 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/builtin/implementation.h @@ -0,0 +1,42 @@ +#ifndef SIMDJSON_BUILTIN_IMPLEMENTATION_H +#define SIMDJSON_BUILTIN_IMPLEMENTATION_H + +#include "simdjson/builtin/base.h" + +#include "simdjson/generic/dependencies.h" + +#define SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_BUILTIN_IMPLEMENTATION_IS(arm64) +#include "simdjson/arm64/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(fallback) +#include "simdjson/fallback/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(haswell) +#include "simdjson/haswell/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(icelake) +#include "simdjson/icelake/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(ppc64) +#error #include "simdjson/ppc64/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(westmere) +#include "simdjson/westmere/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lsx) +#include "simdjson/lsx/implementation.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lasx) +#include "simdjson/lasx/implementation.h" +#else +#error Unknown SIMDJSON_BUILTIN_IMPLEMENTATION +#endif + +#undef SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { + /** + * Function which returns a pointer to an implementation matching the "builtin" implementation. + * The builtin implementation is the best statically linked simdjson implementation that can be used by the compiling + * program. If you compile with g++ -march=haswell, this will return the haswell implementation. + * It is handy to be able to check what builtin was used: builtin_implementation()->name(). 
+ */ + const implementation * builtin_implementation(); +} // namespace simdjson + +#endif // SIMDJSON_BUILTIN_IMPLEMENTATION_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/builtin/ondemand.h b/contrib/libs/simdjson/include/simdjson/builtin/ondemand.h new file mode 100644 index 000000000000..483fa8760adf --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/builtin/ondemand.h @@ -0,0 +1,40 @@ +#ifndef SIMDJSON_BUILTIN_ONDEMAND_H +#define SIMDJSON_BUILTIN_ONDEMAND_H + +#include "simdjson/builtin.h" +#include "simdjson/builtin/base.h" + +#include "simdjson/generic/ondemand/dependencies.h" + +#define SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_BUILTIN_IMPLEMENTATION_IS(arm64) +#include "simdjson/arm64/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(fallback) +#include "simdjson/fallback/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(haswell) +#include "simdjson/haswell/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(icelake) +#include "simdjson/icelake/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(ppc64) +#error #include "simdjson/ppc64/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(westmere) +#include "simdjson/westmere/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lsx) +#include "simdjson/lsx/ondemand.h" +#elif SIMDJSON_BUILTIN_IMPLEMENTATION_IS(lasx) +#include "simdjson/lasx/ondemand.h" +#else +#error Unknown SIMDJSON_BUILTIN_IMPLEMENTATION +#endif + +#undef SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { + /** + * @copydoc simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand + */ + namespace ondemand = SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand; +} // namespace simdjson + +#endif // SIMDJSON_BUILTIN_ONDEMAND_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/common_defs.h b/contrib/libs/simdjson/include/simdjson/common_defs.h new file mode 100644 index 000000000000..d0ae083ecba9 --- /dev/null +++ 
b/contrib/libs/simdjson/include/simdjson/common_defs.h @@ -0,0 +1,347 @@ +#ifndef SIMDJSON_COMMON_DEFS_H +#define SIMDJSON_COMMON_DEFS_H + +#include +#include "simdjson/compiler_check.h" +#include "simdjson/portability.h" + +namespace simdjson { +namespace internal { +/** + * @private + * Our own implementation of the C++17 to_chars function. + * Defined in src/to_chars + */ +char *to_chars(char *first, const char *last, double value); +/** + * @private + * A number parsing routine. + * Defined in src/from_chars + */ +double from_chars(const char *first) noexcept; +double from_chars(const char *first, const char* end) noexcept; +} + +#ifndef SIMDJSON_EXCEPTIONS +#if __cpp_exceptions +#define SIMDJSON_EXCEPTIONS 1 +#else +#define SIMDJSON_EXCEPTIONS 0 +#endif +#endif + +} // namespace simdjson + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. + #define SIMDJSON_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); + #define SIMDJSON_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); + #define SIMDJSON_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +#else + #define SIMDJSON_BEGIN_DEBUG_BLOCK(name) + #define SIMDJSON_END_DEBUG_BLOCK(name) + #define SIMDJSON_DEBUG_BLOCK(name, block) +#endif + +// Align to N-byte boundary +#define SIMDJSON_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define SIMDJSON_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +#define SIMDJSON_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) + +#if SIMDJSON_REGULAR_VISUAL_STUDIO + // We could use [[deprecated]] but it requires C++14 + #define simdjson_deprecated __declspec(deprecated) + + #define simdjson_really_inline __forceinline + #define simdjson_never_inline __declspec(noinline) + + #define simdjson_unused + #define simdjson_warn_unused + + #ifndef simdjson_likely + #define simdjson_likely(x) x + #endif + #ifndef simdjson_unlikely + #define simdjson_unlikely(x) x + #endif + + #define 
SIMDJSON_PUSH_DISABLE_WARNINGS __pragma(warning( push )) + #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) + #define SIMDJSON_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + // Get rid of Intellisense-only warnings (Code Analysis) + // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). + #ifdef __has_include + #if __has_include() + #error #include + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif + #endif + + #ifndef SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #endif + + #define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_VS_WARNING(4996) + #define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING + #define SIMDJSON_POP_DISABLE_WARNINGS __pragma(warning( pop )) + + #define SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS + #define SIMDJSON_POP_DISABLE_UNUSED_WARNINGS + +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + // We could use [[deprecated]] but it requires C++14 + #define simdjson_deprecated __attribute__((deprecated)) + + #define simdjson_really_inline inline __attribute__((always_inline)) + #define simdjson_never_inline inline __attribute__((noinline)) + + #define simdjson_unused __attribute__((unused)) + #define simdjson_warn_unused __attribute__((warn_unused_result)) + + #ifndef simdjson_likely + #define simdjson_likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef simdjson_unlikely + #define simdjson_unlikely(x) __builtin_expect(!!(x), 0) + #endif + + #define SIMDJSON_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") + // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary + // We do it separately for clang since it has different warnings. + #ifdef __clang__ + // clang is missing -Wmaybe-uninitialized. 
+ #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \ + SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wall) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable) + #else // __clang__ + #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \ + SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wall) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wformat-security) + #endif // __clang__ + + #define SIMDJSON_PRAGMA(P) _Pragma(#P) + #define SIMDJSON_DISABLE_GCC_WARNING(WARNING) SIMDJSON_PRAGMA(GCC diagnostic ignored #WARNING) + #if SIMDJSON_CLANG_VISUAL_STUDIO + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #else + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #endif + #define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define SIMDJSON_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + + #define 
SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused) + #define SIMDJSON_POP_DISABLE_UNUSED_WARNINGS SIMDJSON_POP_DISABLE_WARNINGS + + + +#endif // MSC_VER + +#if defined(simdjson_inline) + // Prefer the user's definition of simdjson_inline; don't define it ourselves. +#elif defined(__GNUC__) && !defined(__OPTIMIZE__) + // If optimizations are disabled, forcing inlining can lead to significant + // code bloat and high compile times. Don't use simdjson_really_inline for + // unoptimized builds. + #define simdjson_inline inline +#else + // Force inlining for most simdjson functions. + #define simdjson_inline simdjson_really_inline +#endif + +#if SIMDJSON_VISUAL_STUDIO + /** + * Windows users need to do some extra work when building + * or using a dynamic library (DLL). When building, we need + * to set SIMDJSON_DLLIMPORTEXPORT to __declspec(dllexport). + * When *using* the DLL, the user needs to set + * SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport). + * + * Static libraries not need require such work. + * + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio, you still need to handle these issues. + * + * Non-Windows systems do not have this complexity. + */ + #if SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY + // We set SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL under Windows. + // It should never happen that both SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY and + // SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY are set. + #define SIMDJSON_DLLIMPORTEXPORT __declspec(dllexport) + #elif SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY + // Windows user who call a dynamic library should set SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY to 1. + #define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport) + #else + // We assume by default static linkage + #define SIMDJSON_DLLIMPORTEXPORT + #endif + +/** + * Workaround for the vcpkg package manager. 
Only vcpkg should + * ever touch the next line. The SIMDJSON_USING_LIBRARY macro is otherwise unused. + */ +#if SIMDJSON_USING_LIBRARY +#define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport) +#endif +/** + * End of workaround for the vcpkg package manager. + */ +#else + #define SIMDJSON_DLLIMPORTEXPORT +#endif + +// C++17 requires string_view. +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_HAS_STRING_VIEW +#include // by the standard, this has to be safe. +#endif + +// This macro (__cpp_lib_string_view) has to be defined +// for C++17 and better, but if it is otherwise defined, +// we are going to assume that string_view is available +// even if we do not have C++17 support. +#ifdef __cpp_lib_string_view +#define SIMDJSON_HAS_STRING_VIEW +#endif + +// Some systems have string_view even if we do not have C++17 support, +// and even if __cpp_lib_string_view is undefined, it is the case +// with Apple clang version 11. +// We must handle it. *This is important.* +#ifndef SIMDJSON_HAS_STRING_VIEW +#if defined __has_include +// do not combine the next #if with the previous one (unsafe) +#if __has_include () +// now it is safe to trigger the include +#include // though the file is there, it does not follow that we got the implementation +#if defined(_LIBCPP_STRING_VIEW) +// Ah! So we under libc++ which under its Library Fundamentals Technical Specification, which preceded C++17, +// included string_view. +// This means that we have string_view *even though* we may not have C++17. +#define SIMDJSON_HAS_STRING_VIEW +#endif // _LIBCPP_STRING_VIEW +#endif // __has_include () +#endif // defined __has_include +#endif // def SIMDJSON_HAS_STRING_VIEW +// end of complicated but important routine to try to detect string_view. + +// +// Backfill std::string_view using nonstd::string_view on systems where +// we expect that string_view is missing. Important: if we get this wrong, +// we will end up with two string_view definitions and potential trouble. 
+// That is why we work so hard above to avoid it. +// +#ifndef SIMDJSON_HAS_STRING_VIEW +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS +#error #include "simdjson/nonstd/string_view.hpp" +SIMDJSON_POP_DISABLE_WARNINGS + +namespace std { + using string_view = nonstd::string_view; +} +#endif // SIMDJSON_HAS_STRING_VIEW +#undef SIMDJSON_HAS_STRING_VIEW // We are not going to need this macro anymore. + +/// If EXPR is an error, returns it. +#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } + +// Unless the programmer has already set SIMDJSON_DEVELOPMENT_CHECKS, +// we want to set it under debug builds. We detect a debug build +// under Visual Studio when the _DEBUG macro is set. Under the other +// compilers, we use the fact that they define __OPTIMIZE__ whenever +// they allow optimizations. +// It is possible that this could miss some cases where SIMDJSON_DEVELOPMENT_CHECKS +// is helpful, but the programmer can set the macro SIMDJSON_DEVELOPMENT_CHECKS. +// It could also wrongly set SIMDJSON_DEVELOPMENT_CHECKS (e.g., if the programmer +// sets _DEBUG in a release build under Visual Studio, or if some compiler fails to +// set the __OPTIMIZE__ macro). +#ifndef SIMDJSON_DEVELOPMENT_CHECKS +#ifdef _MSC_VER +// Visual Studio seems to set _DEBUG for debug builds. +#ifdef _DEBUG +#define SIMDJSON_DEVELOPMENT_CHECKS 1 +#endif // _DEBUG +#else // _MSC_VER +// All other compilers appear to set __OPTIMIZE__ to a positive integer +// when the compiler is optimizing. +#ifndef __OPTIMIZE__ +#define SIMDJSON_DEVELOPMENT_CHECKS 1 +#endif // __OPTIMIZE__ +#endif // _MSC_VER +#endif // SIMDJSON_DEVELOPMENT_CHECKS + +// The SIMDJSON_CHECK_EOF macro is a feature flag for the "don't require padding" +// feature. 
+ +#if SIMDJSON_CPLUSPLUS17 +// if we have C++, then fallthrough is a default attribute +# define simdjson_fallthrough [[fallthrough]] +// check if we have __attribute__ support +#elif defined(__has_attribute) +// check if we have the __fallthrough__ attribute +#if __has_attribute(__fallthrough__) +// we are good to go: +# define simdjson_fallthrough __attribute__((__fallthrough__)) +#endif // __has_attribute(__fallthrough__) +#endif // SIMDJSON_CPLUSPLUS17 +// on some systems, we simply do not have support for fallthrough, so use a default: +#ifndef simdjson_fallthrough +# define simdjson_fallthrough do {} while (0) /* fallthrough */ +#endif // simdjson_fallthrough + +#if SIMDJSON_DEVELOPMENT_CHECKS +#define SIMDJSON_DEVELOPMENT_ASSERT(expr) do { assert ((expr)); } while (0) +#else +#define SIMDJSON_DEVELOPMENT_ASSERT(expr) do { } while (0) +#endif + +#ifndef SIMDJSON_UTF8VALIDATION +#define SIMDJSON_UTF8VALIDATION 1 +#endif + +#ifdef __has_include +// How do we detect that a compiler supports vbmi2? +// For sure if the following header is found, we are ok? +#if __has_include() +#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +#ifdef _MSC_VER +#if _MSC_VER >= 1920 +// Visual Studio 2019 and up support VBMI2 under x64 even if the header +// avx512vbmi2intrin.h is not found. +#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +// By default, we allow AVX512. 
+#ifndef SIMDJSON_AVX512_ALLOWED +#define SIMDJSON_AVX512_ALLOWED 1 +#endif + +#endif // SIMDJSON_COMMON_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/compiler_check.h b/contrib/libs/simdjson/include/simdjson/compiler_check.h new file mode 100644 index 000000000000..1d0d03d1545d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/compiler_check.h @@ -0,0 +1,53 @@ +#ifndef SIMDJSON_COMPILER_CHECK_H +#define SIMDJSON_COMPILER_CHECK_H + +#ifndef __cplusplus +#error simdjson requires a C++ compiler +#endif + +#ifndef SIMDJSON_CPLUSPLUS +#if defined(_MSVC_LANG) && !defined(__clang__) +#define SIMDJSON_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG) +#else +#define SIMDJSON_CPLUSPLUS __cplusplus +#endif +#endif + +// C++ 23 +#if !defined(SIMDJSON_CPLUSPLUS23) && (SIMDJSON_CPLUSPLUS >= 202302L) +#define SIMDJSON_CPLUSPLUS23 1 +#endif + +// C++ 20 +#if !defined(SIMDJSON_CPLUSPLUS20) && (SIMDJSON_CPLUSPLUS >= 202002L) +#define SIMDJSON_CPLUSPLUS20 1 +#endif + +// C++ 17 +#if !defined(SIMDJSON_CPLUSPLUS17) && (SIMDJSON_CPLUSPLUS >= 201703L) +#define SIMDJSON_CPLUSPLUS17 1 +#endif + +// C++ 14 +#if !defined(SIMDJSON_CPLUSPLUS14) && (SIMDJSON_CPLUSPLUS >= 201402L) +#define SIMDJSON_CPLUSPLUS14 1 +#endif + +// C++ 11 +#if !defined(SIMDJSON_CPLUSPLUS11) && (SIMDJSON_CPLUSPLUS >= 201103L) +#define SIMDJSON_CPLUSPLUS11 1 +#endif + +#ifndef SIMDJSON_CPLUSPLUS11 +#error simdjson requires a compiler compliant with the C++11 standard +#endif + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + +#endif // SIMDJSON_COMPILER_CHECK_H diff --git a/contrib/libs/simdjson/include/simdjson/dom.h b/contrib/libs/simdjson/include/simdjson/dom.h new file mode 100644 index 000000000000..bcf22ac22dc4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom.h @@ -0,0 +1,23 @@ +#ifndef SIMDJSON_DOM_H +#define SIMDJSON_DOM_H + +#include "simdjson/dom/base.h" 
+#include "simdjson/dom/array.h" +#include "simdjson/dom/document_stream.h" +#include "simdjson/dom/document.h" +#include "simdjson/dom/element.h" +#include "simdjson/dom/object.h" +#include "simdjson/dom/parser.h" +#include "simdjson/dom/serialization.h" + +// Inline functions +#include "simdjson/dom/array-inl.h" +#include "simdjson/dom/document_stream-inl.h" +#include "simdjson/dom/document-inl.h" +#include "simdjson/dom/element-inl.h" +#include "simdjson/dom/object-inl.h" +#include "simdjson/dom/parser-inl.h" +#include "simdjson/internal/tape_ref-inl.h" +#include "simdjson/dom/serialization-inl.h" + +#endif // SIMDJSON_DOM_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/array-inl.h b/contrib/libs/simdjson/include/simdjson/dom/array-inl.h new file mode 100644 index 000000000000..6a29ef855321 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/array-inl.h @@ -0,0 +1,181 @@ +#ifndef SIMDJSON_ARRAY_INL_H +#define SIMDJSON_ARRAY_INL_H + +#include + +#include "simdjson/dom/base.h" +#include "simdjson/dom/array.h" +#include "simdjson/dom/element.h" +#include "simdjson/error-inl.h" +#include "simdjson/internal/tape_ref-inl.h" + +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::array value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} + +#if SIMDJSON_EXCEPTIONS + +inline dom::array::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +inline dom::array::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +inline size_t simdjson_result::size() const 
noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.size(); +} + +#endif // SIMDJSON_EXCEPTIONS + +inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +inline simdjson_result simdjson_result::at(size_t index) const noexcept { + if (error()) { return error(); } + return first.at(index); +} + +namespace dom { + +// +// array inline implementation +// +simdjson_inline array::array() noexcept : tape{} {} +simdjson_inline array::array(const internal::tape_ref &_tape) noexcept : tape{_tape} {} +inline array::iterator array::begin() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return internal::tape_ref(tape.doc, tape.json_index + 1); +} +inline array::iterator array::end() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return internal::tape_ref(tape.doc, tape.after_element() - 1); +} +inline size_t array::size() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return tape.scope_count(); +} +inline size_t array::number_of_slots() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return tape.matching_brace_index() - tape.json_index; +} +inline simdjson_result array::at_pointer(std::string_view json_pointer) const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + if(json_pointer.empty()) { // an empty string means that we return the current node + return element(this->tape); // copy the current node + } else if(json_pointer[0] != '/') { // otherwise there is an error + return INVALID_JSON_POINTER; + } + json_pointer = json_pointer.substr(1); + // - means "the append position" or "the 
element after the end of the array" + // We don't support this, because we're returning a real element, not a position. + if (json_pointer == "-") { return INDEX_OUT_OF_BOUNDS; } + + // Read the array index + size_t array_index = 0; + size_t i; + for (i = 0; i < json_pointer.length() && json_pointer[i] != '/'; i++) { + uint8_t digit = uint8_t(json_pointer[i] - '0'); + // Check for non-digit in array index. If it's there, we're trying to get a field in an object + if (digit > 9) { return INCORRECT_TYPE; } + array_index = array_index*10 + digit; + } + + // 0 followed by other digits is invalid + if (i > 1 && json_pointer[0] == '0') { return INVALID_JSON_POINTER; } // "JSON pointer array index has other characters after 0" + + // Empty string is invalid; so is a "/" with no digits before it + if (i == 0) { return INVALID_JSON_POINTER; } // "Empty string in JSON pointer array index" + + // Get the child + auto child = array(tape).at(array_index); + // If there is an error, it ends here + if(child.error()) { + return child; + } + // If there is a /, we're not done yet, call recursively. 
+ if (i < json_pointer.length()) { + child = child.at_pointer(json_pointer.substr(i)); + } + return child; +} + +inline simdjson_result array::at(size_t index) const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + size_t i=0; + for (auto element : *this) { + if (i == index) { return element; } + i++; + } + return INDEX_OUT_OF_BOUNDS; +} + +inline array::operator element() const noexcept { + return element(tape); +} + +// +// array::iterator inline implementation +// +simdjson_inline array::iterator::iterator(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline element array::iterator::operator*() const noexcept { + return element(tape); +} +inline array::iterator& array::iterator::operator++() noexcept { + tape.json_index = tape.after_element(); + return *this; +} +inline array::iterator array::iterator::operator++(int) noexcept { + array::iterator out = *this; + ++*this; + return out; +} +inline bool array::iterator::operator!=(const array::iterator& other) const noexcept { + return tape.json_index != other.tape.json_index; +} +inline bool array::iterator::operator==(const array::iterator& other) const noexcept { + return tape.json_index == other.tape.json_index; +} +inline bool array::iterator::operator<(const array::iterator& other) const noexcept { + return tape.json_index < other.tape.json_index; +} +inline bool array::iterator::operator<=(const array::iterator& other) const noexcept { + return tape.json_index <= other.tape.json_index; +} +inline bool array::iterator::operator>=(const array::iterator& other) const noexcept { + return tape.json_index >= other.tape.json_index; +} +inline bool array::iterator::operator>(const array::iterator& other) const noexcept { + return tape.json_index > other.tape.json_index; +} + +} // namespace dom + + +} // namespace simdjson + +#include "simdjson/dom/element-inl.h" + +#if defined(__cpp_lib_ranges) +static_assert(std::ranges::view); 
+static_assert(std::ranges::sized_range); +#if SIMDJSON_EXCEPTIONS +static_assert(std::ranges::view>); +static_assert(std::ranges::sized_range>); +#endif // SIMDJSON_EXCEPTIONS +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_ARRAY_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/array.h b/contrib/libs/simdjson/include/simdjson/dom/array.h new file mode 100644 index 000000000000..a90813a4504f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/array.h @@ -0,0 +1,183 @@ +#ifndef SIMDJSON_DOM_ARRAY_H +#define SIMDJSON_DOM_ARRAY_H + +#include "simdjson/dom/base.h" +#include "simdjson/internal/tape_ref.h" + +namespace simdjson { +namespace dom { + +/** + * JSON array. + */ +class array { +public: + /** Create a new, invalid array */ + simdjson_inline array() noexcept; + + class iterator { + public: + using value_type = element; + using difference_type = std::ptrdiff_t; + using pointer = void; + using reference = value_type; + using iterator_category = std::forward_iterator_tag; + + /** + * Get the actual value + */ + inline reference operator*() const noexcept; + /** + * Get the next value. + * + * Part of the std::iterator interface. + */ + inline iterator& operator++() noexcept; + /** + * Get the next value. + * + * Part of the std::iterator interface. + */ + inline iterator operator++(int) noexcept; + /** + * Check if these values come from the same place in the JSON. + * + * Part of the std::iterator interface. 
+ */ + inline bool operator!=(const iterator& other) const noexcept; + inline bool operator==(const iterator& other) const noexcept; + + inline bool operator<(const iterator& other) const noexcept; + inline bool operator<=(const iterator& other) const noexcept; + inline bool operator>=(const iterator& other) const noexcept; + inline bool operator>(const iterator& other) const noexcept; + + iterator() noexcept = default; + iterator(const iterator&) noexcept = default; + iterator& operator=(const iterator&) noexcept = default; + private: + simdjson_inline iterator(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class array; + }; + + /** + * Return the first array element. + * + * Part of the std::iterable interface. + */ + inline iterator begin() const noexcept; + /** + * One past the last array element. + * + * Part of the std::iterable interface. + */ + inline iterator end() const noexcept; + /** + * Get the size of the array (number of immediate children). + * It is a saturated value with a maximum of 0xFFFFFF: if the value + * is 0xFFFFFF then the size is 0xFFFFFF or greater. + */ + inline size_t size() const noexcept; + /** + * Get the total number of slots used by this array on the tape. + * + * Note that this is not the same thing as `size()`, which reports the + * number of actual elements within an array (not counting its children). + * + * Since an element can use 1 or 2 slots on the tape, you can only use this + * to figure out the total size of an array (including its children, + * recursively) if you know its structure ahead of time. + **/ + inline size_t number_of_slots() const noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. 
+ * + * dom::parser parser; + * array a = parser.parse(R"([ { "foo": { "a": [ 10, 20, 30 ] }} ])"_padded); + * a.at_pointer("/0/foo/a/1") == 20 + * a.at_pointer("0")["foo"]["a"].at(1) == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + + /** + * Get the value at the given index. This function has linear-time complexity and + * is equivalent to the following: + * + * size_t i=0; + * for (auto element : *this) { + * if (i == index) { return element; } + * i++; + * } + * return INDEX_OUT_OF_BOUNDS; + * + * Avoid calling the at() function repeatedly. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + inline simdjson_result at(size_t index) const noexcept; + + /** + * Implicitly convert object to element + */ + inline operator element() const noexcept; + +private: + simdjson_inline array(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class element; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; +}; + + +} // namespace dom + +/** The result of a JSON conversion that may fail. 
*/ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::array value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + inline simdjson_result at(size_t index) const noexcept; + +#if SIMDJSON_EXCEPTIONS + inline dom::array::iterator begin() const noexcept(false); + inline dom::array::iterator end() const noexcept(false); + inline size_t size() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + + + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +#include + +namespace std { +namespace ranges { +template<> +inline constexpr bool enable_view = true; +#if SIMDJSON_EXCEPTIONS +template<> +inline constexpr bool enable_view> = true; +#endif // SIMDJSON_EXCEPTIONS +} // namespace ranges +} // namespace std +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_DOM_ARRAY_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/base.h b/contrib/libs/simdjson/include/simdjson/dom/base.h new file mode 100644 index 000000000000..b862277a2113 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/base.h @@ -0,0 +1,54 @@ +#ifndef SIMDJSON_DOM_BASE_H +#define SIMDJSON_DOM_BASE_H + +#include "simdjson/base.h" + +namespace simdjson { + +/** + * @brief A DOM API on top of the simdjson parser. + */ +namespace dom { + +/** The default batch size for parser.parse_many() and parser.load_many() */ +static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; +/** + * Some adversary might try to set the batch size to 0 or 1, which might cause problems. + * We set a minimum of 32B since anything else is highly likely to be an error. In practice, + * most users will want a much larger batch size. 
+ * + * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON + * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. + */ +static constexpr size_t MINIMAL_BATCH_SIZE = 32; + +/** + * It is wasteful to allocate memory for tiny documents (e.g., 4 bytes). + */ +static constexpr size_t MINIMAL_DOCUMENT_CAPACITY = 32; + +class array; +class document; +class document_stream; +class element; +class key_value_pair; +class object; +class parser; + +#ifdef SIMDJSON_THREADS_ENABLED +struct stage1_worker; +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace dom + +namespace internal { + +template +class string_builder; +class tape_ref; + +} // namespace internal + +} // namespace simdjson + +#endif // SIMDJSON_DOM_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/dom/document-inl.h b/contrib/libs/simdjson/include/simdjson/dom/document-inl.h new file mode 100644 index 000000000000..40d74b999e0b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/document-inl.h @@ -0,0 +1,159 @@ +#ifndef SIMDJSON_DOCUMENT_INL_H +#define SIMDJSON_DOCUMENT_INL_H + +// Inline implementations go in here. + +#include "simdjson/dom/base.h" +#include "simdjson/dom/document.h" +#include "simdjson/dom/element-inl.h" +#include "simdjson/internal/tape_ref-inl.h" +#include "simdjson/internal/jsonformatutils.h" + +#include + +namespace simdjson { +namespace dom { + +// +// document inline implementation +// +inline element document::root() const noexcept { + return element(internal::tape_ref(this, 1)); +} +simdjson_warn_unused +inline size_t document::capacity() const noexcept { + return allocated_capacity; +} + +simdjson_warn_unused +inline error_code document::allocate(size_t capacity) noexcept { + if (capacity == 0) { + string_buf.reset(); + tape.reset(); + allocated_capacity = 0; + return SUCCESS; + } + + // a pathological input like "[[[[..." 
would generate capacity tape elements, so + // need a capacity of at least capacity + 1, but it is also possible to do + // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" + //where capacity + 1 tape elements are + // generated, see issue https://github.com/simdjson/simdjson/issues/345 + size_t tape_capacity = SIMDJSON_ROUNDUP_N(capacity + 3, 64); + // a document with only zero-length strings... could have capacity/3 string + // and we would need capacity/3 * 5 bytes on the string buffer + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * capacity / 3 + SIMDJSON_PADDING, 64); + string_buf.reset( new (std::nothrow) uint8_t[string_capacity]); + tape.reset(new (std::nothrow) uint64_t[tape_capacity]); + if(!(string_buf && tape)) { + allocated_capacity = 0; + string_buf.reset(); + tape.reset(); + return MEMALLOC; + } + // Technically the allocated_capacity might be larger than capacity + // so the next line is pessimistic. + allocated_capacity = capacity; + return SUCCESS; +} + +inline bool document::dump_raw_tape(std::ostream &os) const noexcept { + uint32_t string_length; + size_t tape_idx = 0; + uint64_t tape_val = tape[tape_idx]; + uint8_t type = uint8_t(tape_val >> 56); + os << tape_idx << " : " << type; + tape_idx++; + size_t how_many = 0; + if (type == 'r') { + how_many = size_t(tape_val & internal::JSON_VALUE_MASK); + } else { + // Error: no starting root node? 
+ return false; + } + os << "\t// pointing to " << how_many << " (right after last node)\n"; + uint64_t payload; + for (; tape_idx < how_many; tape_idx++) { + os << tape_idx << " : "; + tape_val = tape[tape_idx]; + payload = tape_val & internal::JSON_VALUE_MASK; + type = uint8_t(tape_val >> 56); + switch (type) { + case '"': // we have a string + os << "string \""; + std::memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t)); + os << internal::escape_json_string(std::string_view( + reinterpret_cast(string_buf.get() + payload + sizeof(uint32_t)), + string_length + )); + os << '"'; + os << '\n'; + break; + case 'l': // we have a long int + if (tape_idx + 1 >= how_many) { + return false; + } + os << "integer " << static_cast(tape[++tape_idx]) << "\n"; + break; + case 'u': // we have a long uint + if (tape_idx + 1 >= how_many) { + return false; + } + os << "unsigned integer " << tape[++tape_idx] << "\n"; + break; + case 'd': // we have a double + os << "float "; + if (tape_idx + 1 >= how_many) { + return false; + } + double answer; + std::memcpy(&answer, &tape[++tape_idx], sizeof(answer)); + os << answer << '\n'; + break; + case 'n': // we have a null + os << "null\n"; + break; + case 't': // we have a true + os << "true\n"; + break; + case 'f': // we have a false + os << "false\n"; + break; + case '{': // we have an object + os << "{\t// pointing to next tape location " << uint32_t(payload) + << " (first node after the scope), " + << " saturated count " + << ((payload >> 32) & internal::JSON_COUNT_MASK)<< "\n"; + break; case '}': // we end an object + os << "}\t// pointing to previous tape location " << uint32_t(payload) + << " (start of the scope)\n"; + break; + case '[': // we start an array + os << "[\t// pointing to next tape location " << uint32_t(payload) + << " (first node after the scope), " + << " saturated count " + << ((payload >> 32) & internal::JSON_COUNT_MASK)<< "\n"; + break; + case ']': // we end an array + os << "]\t// pointing to 
previous tape location " << uint32_t(payload) + << " (start of the scope)\n"; + break; + case 'r': // we start and end with the root node + // should we be hitting the root node? + return false; + default: + return false; + } + } + tape_val = tape[tape_idx]; + payload = tape_val & internal::JSON_VALUE_MASK; + type = uint8_t(tape_val >> 56); + os << tape_idx << " : " << type << "\t// pointing to " << payload + << " (start root)\n"; + return true; +} + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_DOCUMENT_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/document.h b/contrib/libs/simdjson/include/simdjson/dom/document.h new file mode 100644 index 000000000000..6c5e284bb47f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/document.h @@ -0,0 +1,91 @@ +#ifndef SIMDJSON_DOM_DOCUMENT_H +#define SIMDJSON_DOM_DOCUMENT_H + +#include "simdjson/dom/base.h" + +#include + +namespace simdjson { +namespace dom { + +/** + * A parsed JSON document. + * + * This class cannot be copied, only moved, to avoid unintended allocations. + */ +class document { +public: + /** + * Create a document container with zero capacity. + * + * The parser will allocate capacity as needed. + */ + document() noexcept = default; + ~document() noexcept = default; + + /** + * Take another document's buffers. + * + * @param other The document to take. Its capacity is zeroed and it is invalidated. + */ + document(document &&other) noexcept = default; + /** @private */ + document(const document &) = delete; // Disallow copying + /** + * Take another document's buffers. + * + * @param other The document to take. Its capacity is zeroed. + */ + document &operator=(document &&other) noexcept = default; + /** @private */ + document &operator=(const document &) = delete; // Disallow copying + + /** + * Get the root element of this document as a JSON array. + */ + element root() const noexcept; + + /** + * @private Dump the raw tape for debugging. 
+ * + * @param os the stream to output to. + * @return false if the tape is likely wrong (e.g., you did not parse a valid JSON). + */ + bool dump_raw_tape(std::ostream &os) const noexcept; + + /** @private Structural values. */ + std::unique_ptr tape{}; + + /** @private String values. + * + * Should be at least byte_capacity. + */ + std::unique_ptr string_buf{}; + /** @private Allocate memory to support + * input JSON documents of up to len bytes. + * + * When calling this function, you lose + * all the data. + * + * The memory allocation is strict: you + * can you use this function to increase + * or lower the amount of allocated memory. + * Passsing zero clears the memory. + */ + error_code allocate(size_t len) noexcept; + /** @private Capacity in bytes, in terms + * of how many bytes of input JSON we can + * support. + */ + size_t capacity() const noexcept; + + +private: + size_t allocated_capacity{0}; + friend class parser; +}; // class document + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_DOM_DOCUMENT_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/document_stream-inl.h b/contrib/libs/simdjson/include/simdjson/dom/document_stream-inl.h new file mode 100644 index 000000000000..b7062481f217 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/document_stream-inl.h @@ -0,0 +1,348 @@ +#ifndef SIMDJSON_DOCUMENT_STREAM_INL_H +#define SIMDJSON_DOCUMENT_STREAM_INL_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/document_stream.h" +#include "simdjson/dom/element-inl.h" +#include "simdjson/dom/parser-inl.h" +#include "simdjson/error-inl.h" +#include "simdjson/internal/dom_parser_implementation.h" + +namespace simdjson { +namespace dom { + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void stage1_worker::finish() { + // After calling "run" someone would call finish() to wait + // for the end of the processing. 
+ // This function will wait until either the thread has done + // the processing or, else, the destructor has been called. + std::unique_lock lock(locking_mutex); + cond_var.wait(lock, [this]{return has_work == false;}); +} + +inline stage1_worker::~stage1_worker() { + // The thread may never outlive the stage1_worker instance + // and will always be stopped/joined before the stage1_worker + // instance is gone. + stop_thread(); +} + +inline void stage1_worker::start_thread() { + std::unique_lock lock(locking_mutex); + if(thread.joinable()) { + return; // This should never happen but we never want to create more than one thread. + } + thread = std::thread([this]{ + while(true) { + std::unique_lock thread_lock(locking_mutex); + // We wait for either "run" or "stop_thread" to be called. + cond_var.wait(thread_lock, [this]{return has_work || !can_work;}); + // If, for some reason, the stop_thread() method was called (i.e., the + // destructor of stage1_worker is called, then we want to immediately destroy + // the thread (and not do any more processing). + if(!can_work) { + break; + } + this->owner->stage1_thread_error = this->owner->run_stage1(*this->stage1_thread_parser, + this->_next_batch_start); + this->has_work = false; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify "finish" + thread_lock.unlock(); + } + } + ); +} + + +inline void stage1_worker::stop_thread() { + std::unique_lock lock(locking_mutex); + // We have to make sure that all locks can be released. 
+ can_work = false; + has_work = false; + cond_var.notify_all(); + lock.unlock(); + if(thread.joinable()) { + thread.join(); + } +} + +inline void stage1_worker::run(document_stream * ds, dom::parser * stage1, size_t next_batch_start) { + std::unique_lock lock(locking_mutex); + owner = ds; + _next_batch_start = next_batch_start; + stage1_thread_parser = stage1; + has_work = true; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify the thread lock that we have work + lock.unlock(); +} +#endif + +simdjson_inline document_stream::document_stream( + dom::parser &_parser, + const uint8_t *_buf, + size_t _len, + size_t _batch_size +) noexcept + : parser{&_parser}, + buf{_buf}, + len{_len}, + batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? MINIMAL_BATCH_SIZE : _batch_size}, + error{SUCCESS} +#ifdef SIMDJSON_THREADS_ENABLED + , use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change +#endif +{ +#ifdef SIMDJSON_THREADS_ENABLED + if(worker.get() == nullptr) { + error = MEMALLOC; + } +#endif +} + +simdjson_inline document_stream::document_stream() noexcept + : parser{nullptr}, + buf{nullptr}, + len{0}, + batch_size{0}, + error{UNINITIALIZED} +#ifdef SIMDJSON_THREADS_ENABLED + , use_thread(false) +#endif +{ +} + +simdjson_inline document_stream::~document_stream() noexcept { +#ifdef SIMDJSON_THREADS_ENABLED + worker.reset(); +#endif +} + +simdjson_inline document_stream::iterator::iterator() noexcept + : stream{nullptr}, finished{true} { +} + +simdjson_inline document_stream::iterator document_stream::begin() noexcept { + start(); + // If there are no documents, we're finished. 
+ return iterator(this, error == EMPTY); +} + +simdjson_inline document_stream::iterator document_stream::end() noexcept { + return iterator(this, true); +} + +simdjson_inline document_stream::iterator::iterator(document_stream* _stream, bool is_end) noexcept + : stream{_stream}, finished{is_end} { +} + +simdjson_inline document_stream::iterator::reference document_stream::iterator::operator*() noexcept { + // Note that in case of error, we do not yet mark + // the iterator as "finished": this detection is done + // in the operator++ function since it is possible + // to call operator++ repeatedly while omitting + // calls to operator*. + if (stream->error) { return stream->error; } + return stream->parser->doc.root(); +} + +simdjson_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { + // If there is an error, then we want the iterator + // to be finished, no matter what. (E.g., we do not + // keep generating documents with errors, or go beyond + // a document with errors.) + // + // Users do not have to call "operator*()" when they use operator++, + // so we need to end the stream in the operator++ function. + // + // Note that setting finished = true is essential otherwise + // we would enter an infinite loop. + if (stream->error) { finished = true; } + // Note that stream->error() is guarded against error conditions + // (it will immediately return if stream->error casts to false). + // In effect, this next function does nothing when (stream->error) + // is true (hence the risk of an infinite loop). + stream->next(); + // If that was the last document, we're finished. + // It is the only type of error we do not want to appear + // in operator*. + if (stream->error == EMPTY) { finished = true; } + // If we had any other kind of error (not EMPTY) then we want + // to pass it along to the operator* and we cannot mark the result + // as "finished" just yet. 
+ return *this; +} + +simdjson_inline bool document_stream::iterator::operator!=(const document_stream::iterator &other) const noexcept { + return finished != other.finished; +} + +inline void document_stream::start() noexcept { + if (error) { return; } + error = parser->ensure_capacity(batch_size); + if (error) { return; } + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(*parser, batch_start); + while(error == EMPTY) { + // In exceptional cases, we may start with an empty block + batch_start = next_batch_start(); + if (batch_start >= len) { return; } + error = run_stage1(*parser, batch_start); + } + if (error) { return; } +#ifdef SIMDJSON_THREADS_ENABLED + if (use_thread && next_batch_start() < len) { + // Kick off the first thread if needed + error = stage1_thread_parser.ensure_capacity(batch_size); + if (error) { return; } + worker->start_thread(); + start_stage1_thread(); + if (error) { return; } + } +#endif // SIMDJSON_THREADS_ENABLED + next(); +} + +simdjson_inline size_t document_stream::iterator::current_index() const noexcept { + return stream->doc_index; +} + +simdjson_inline std::string_view document_stream::iterator::source() const noexcept { + const char* start = reinterpret_cast(stream->buf) + current_index(); + bool object_or_array = ((*start == '[') || (*start == '{')); + if(object_or_array) { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index - 1]; + return std::string_view(start, next_doc_index - current_index() + 1); + } else { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index]; + size_t svlen = next_doc_index - current_index(); + while(svlen > 1 && (std::isspace(start[svlen-1]) || start[svlen-1] == '\0')) { + svlen--; + } + return std::string_view(start, svlen); + } +} + + +inline void 
document_stream::next() noexcept { + // We always exit at once, once in an error condition. + if (error) { return; } + + // Load the next document from the batch + doc_index = batch_start + parser->implementation->structural_indexes[parser->implementation->next_structural_index]; + error = parser->implementation->stage2_next(parser->doc); + // If that was the last document in the batch, load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + +#ifdef SIMDJSON_THREADS_ENABLED + if(use_thread) { + load_from_stage1_thread(); + } else { + error = run_stage1(*parser, batch_start); + } +#else + error = run_stage1(*parser, batch_start); +#endif + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. + // Run stage 2 on the first document in the batch + doc_index = batch_start + parser->implementation->structural_indexes[parser->implementation->next_structural_index]; + error = parser->implementation->stage2_next(parser->doc); + } +} +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} + +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +} + +inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept { + size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, 
stage1_mode::streaming_partial); + } +} + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + worker->finish(); + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(*parser, stage1_thread_parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! + if (next_batch_start() < len) { + start_stage1_thread(); + } +} + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. + this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + + worker->run(this, & this->stage1_thread_parser, _next_batch_start); +} + +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace dom + +simdjson_inline simdjson_result::simdjson_result() noexcept + : simdjson_result_base() { +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : simdjson_result_base(error) { +} +simdjson_inline simdjson_result::simdjson_result(dom::document_stream &&value) noexcept + : simdjson_result_base(std::forward(value)) { +} + +#if SIMDJSON_EXCEPTIONS +simdjson_inline dom::document_stream::iterator simdjson_result::begin() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +simdjson_inline dom::document_stream::iterator simdjson_result::end() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +#else // SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +simdjson_inline dom::document_stream::iterator simdjson_result::begin() noexcept { + first.error = error(); + return first.begin(); +} +simdjson_inline 
dom::document_stream::iterator simdjson_result::end() noexcept { + first.error = error(); + return first.end(); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS + +} // namespace simdjson +#endif // SIMDJSON_DOCUMENT_STREAM_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/document_stream.h b/contrib/libs/simdjson/include/simdjson/dom/document_stream.h new file mode 100644 index 000000000000..308f20e25ae0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/document_stream.h @@ -0,0 +1,322 @@ +#ifndef SIMDJSON_DOCUMENT_STREAM_H +#define SIMDJSON_DOCUMENT_STREAM_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/parser.h" + +#ifdef SIMDJSON_THREADS_ENABLED +#include +#include +#include +#endif + +namespace simdjson { +namespace dom { + +#ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ +struct stage1_worker { + stage1_worker() noexcept = default; + stage1_worker(const stage1_worker&) = delete; + stage1_worker(stage1_worker&&) = delete; + stage1_worker operator=(const stage1_worker&) = delete; + ~stage1_worker(); + /** + * We only start the thread when it is needed, not at object construction, this may throw. + * You should only call this once. + **/ + void start_thread(); + /** + * Start a stage 1 job. You should first call 'run', then 'finish'. + * You must call start_thread once before. + */ + void run(document_stream * ds, dom::parser * stage1, size_t next_batch_start); + /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/ + void finish(); + +private: + + /** + * Normally, we would never stop the thread. But we do in the destructor. + * This function is only safe assuming that you are not waiting for results. You + * should have called run, then finish, and be done. + **/ + void stop_thread(); + + std::thread thread{}; + /** These three variables define the work done by the thread. 
**/ + dom::parser * stage1_thread_parser{}; + size_t _next_batch_start{}; + document_stream * owner{}; + /** + * We have two state variables. This could be streamlined to one variable in the future but + * we use two for clarity. + */ + bool has_work{false}; + bool can_work{true}; + + /** + * We lock using a mutex. + */ + std::mutex locking_mutex{}; + std::condition_variable cond_var{}; +}; +#endif + +/** + * A forward-only stream of documents. + * + * Produced by parser::parse_many. + * + */ +class document_stream { +public: + /** + * Construct an uninitialized document_stream. + * + * ```c++ + * document_stream docs; + * error = parser.parse_many(json).get(docs); + * ``` + */ + simdjson_inline document_stream() noexcept; + /** Move one document_stream to another. */ + simdjson_inline document_stream(document_stream &&other) noexcept = default; + /** Move one document_stream to another. */ + simdjson_inline document_stream &operator=(document_stream &&other) noexcept = default; + + simdjson_inline ~document_stream() noexcept; + /** + * Returns the input size in bytes. + */ + inline size_t size_in_bytes() const noexcept; + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). + * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.parse_many(json,window); + * for(auto doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; + /** + * An iterator through a forward-only stream of documents. 
+ */ + class iterator { + public: + using value_type = simdjson_result; + using reference = value_type; + + using difference_type = std::ptrdiff_t; + + using iterator_category = std::input_iterator_tag; + + /** + * Default constructor. + */ + simdjson_inline iterator() noexcept; + /** + * Get the current document (or error). + */ + simdjson_inline reference operator*() noexcept; + /** + * Advance to the next document (prefix). + */ + inline iterator& operator++() noexcept; + /** + * Check if we're at the end yet. + * @param other the end iterator to compare to. + */ + simdjson_inline bool operator!=(const iterator &other) const noexcept; + /** + * @private + * + * Gives the current index in the input document in bytes. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_inline size_t current_index() const noexcept; + /** + * @private + * + * Gives a view of the current document. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * std::string_view v = i->source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_inline std::string_view source() const noexcept; + + private: + simdjson_inline iterator(document_stream *s, bool finished) noexcept; + /** The document_stream we're iterating through. 
*/ + document_stream* stream; + /** Whether we're finished or not. */ + bool finished; + friend class document_stream; + }; + + /** + * Start iterating the documents in the stream. + */ + simdjson_inline iterator begin() noexcept; + /** + * The end of the stream, for iterator comparison purposes. + */ + simdjson_inline iterator end() noexcept; + +private: + + document_stream &operator=(const document_stream &) = delete; // Disallow copying + document_stream(const document_stream &other) = delete; // Disallow copying + + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. + * + * @param parser is a reference to the parser instance used to generate this document_stream + * @param buf is the raw byte buffer we need to process + * @param len is the length of the raw byte buffer in bytes + * @param batch_size is the size of the windows (must be strictly greater or equal to the largest JSON document) + */ + simdjson_inline document_stream( + dom::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size + ) noexcept; + + /** + * Parse the first document in the buffer. Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; + + /** + * Parse the next document found in the buffer previously given to document_stream. + * + * The content should be a valid JSON document encoded as UTF-8. If there is a + * UTF-8 BOM, the parser skips it. + * + * You do NOT need to pre-allocate a parser. This function takes care of + * pre-allocating a capacity defined by the batch_size defined when creating the + * document_stream object. + * + * The function returns simdjson::EMPTY if there is no more data to be parsed. + * + * The function returns simdjson::SUCCESS (as integer = 0) in case of success + * and indicates that the buffer has successfully been parsed to the end. + * Every document it contained has been parsed without error. 
+ * + * The function returns an error code from simdjson/simdjson.h in case of failure + * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; + * the simdjson::error_message function converts these error codes into a string). + * + * You can also check validity by calling parser.is_valid(). The same parser can + * and should be reused for the other documents in the buffer. + */ + inline void next() noexcept; + + /** + * Pass the next batch through stage 1 and return when finished. + * When threads are enabled, this may wait for the stage 1 thread to finish. + */ + inline void load_batch() noexcept; + + /** Get the next document index. */ + inline size_t next_batch_start() const noexcept; + + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept; + + dom::parser *parser; + const uint8_t *buf; + size_t len; + size_t batch_size; + /** The error (or lack thereof) from the current document. */ + error_code error; + size_t batch_start{0}; + size_t doc_index{}; +#ifdef SIMDJSON_THREADS_ENABLED + /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ + bool use_thread; + + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. */ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + friend struct stage1_worker; + std::unique_ptr worker{new(std::nothrow) stage1_worker()}; + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. 
+ */ + dom::parser stage1_thread_parser{}; +#endif // SIMDJSON_THREADS_ENABLED + + friend class dom::parser; + friend struct simdjson_result; + friend struct internal::simdjson_result_base; + +}; // class document_stream + +} // namespace dom + +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result(dom::document_stream &&value) noexcept; ///< @private + +#if SIMDJSON_EXCEPTIONS + simdjson_inline dom::document_stream::iterator begin() noexcept(false); + simdjson_inline dom::document_stream::iterator end() noexcept(false); +#else // SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + [[deprecated("parse_many() and load_many() may return errors. Use document_stream stream; error = parser.parse_many().get(doc); instead.")]] + simdjson_inline dom::document_stream::iterator begin() noexcept; + [[deprecated("parse_many() and load_many() may return errors. 
Use document_stream stream; error = parser.parse_many().get(doc); instead.")]] + simdjson_inline dom::document_stream::iterator end() noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS +}; // struct simdjson_result + +} // namespace simdjson + +#endif // SIMDJSON_DOCUMENT_STREAM_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/element-inl.h b/contrib/libs/simdjson/include/simdjson/dom/element-inl.h new file mode 100644 index 000000000000..cab313beb901 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/element-inl.h @@ -0,0 +1,473 @@ +#ifndef SIMDJSON_ELEMENT_INL_H +#define SIMDJSON_ELEMENT_INL_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/element.h" +#include "simdjson/dom/document.h" +#include "simdjson/dom/object.h" +#include "simdjson/internal/tape_type.h" + +#include "simdjson/dom/object-inl.h" +#include "simdjson/error-inl.h" + +#include +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::element &&value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} +inline simdjson_result simdjson_result::type() const noexcept { + if (error()) { return error(); } + return first.type(); +} + +template +simdjson_inline bool simdjson_result::is() const noexcept { + return !error() && first.is(); +} +template +simdjson_inline simdjson_result simdjson_result::get() const noexcept { + if (error()) { return error(); } + return first.get(); +} +template +simdjson_warn_unused simdjson_inline error_code simdjson_result::get(T &value) const noexcept { + if (error()) { return error(); } + return first.get(value); +} + +simdjson_inline simdjson_result simdjson_result::get_array() const 
noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() const noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_c_str() const noexcept { + if (error()) { return error(); } + return first.get_c_str(); +} +simdjson_inline simdjson_result simdjson_result::get_string_length() const noexcept { + if (error()) { return error(); } + return first.get_string_length(); +} +simdjson_inline simdjson_result simdjson_result::get_string() const noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() const noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() const noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_double() const noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() const noexcept { + if (error()) { return error(); } + return first.get_bool(); +} + +simdjson_inline bool simdjson_result::is_array() const noexcept { + return !error() && first.is_array(); +} +simdjson_inline bool simdjson_result::is_object() const noexcept { + return !error() && first.is_object(); +} +simdjson_inline bool simdjson_result::is_string() const noexcept { + return !error() && first.is_string(); +} +simdjson_inline bool simdjson_result::is_int64() const noexcept { + return !error() && first.is_int64(); +} +simdjson_inline bool simdjson_result::is_uint64() const noexcept { + return !error() && first.is_uint64(); +} +simdjson_inline bool simdjson_result::is_double() const noexcept { + return !error() && first.is_double(); +} +simdjson_inline bool simdjson_result::is_number() const noexcept { + return 
!error() && first.is_number(); +} +simdjson_inline bool simdjson_result::is_bool() const noexcept { + return !error() && first.is_bool(); +} + +simdjson_inline bool simdjson_result::is_null() const noexcept { + return !error() && first.is_null(); +} + +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::at_pointer(const std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] +simdjson_inline simdjson_result simdjson_result::at(const std::string_view json_pointer) const noexcept { +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_DEPRECATED_WARNING + if (error()) { return error(); } + return first.at(json_pointer); +SIMDJSON_POP_DISABLE_WARNINGS +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +simdjson_inline simdjson_result simdjson_result::at(size_t index) const noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::at_key(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key(key); +} +simdjson_inline simdjson_result simdjson_result::at_key_case_insensitive(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key_case_insensitive(key); +} + +#if SIMDJSON_EXCEPTIONS + +simdjson_inline simdjson_result::operator bool() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator const char *() const noexcept(false) { + return get(); +} +simdjson_inline 
simdjson_result::operator std::string_view() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator uint64_t() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator int64_t() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator double() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator dom::array() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator dom::object() const noexcept(false) { + return get(); +} + +simdjson_inline dom::array::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +simdjson_inline dom::array::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} + +#endif // SIMDJSON_EXCEPTIONS + +namespace dom { + +// +// element inline implementation +// +simdjson_inline element::element() noexcept : tape{} {} +simdjson_inline element::element(const internal::tape_ref &_tape) noexcept : tape{_tape} { } + +inline element_type element::type() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + auto tape_type = tape.tape_ref_type(); + return tape_type == internal::tape_type::FALSE_VALUE ? 
element_type::BOOL : static_cast(tape_type); +} + +inline simdjson_result element::get_bool() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + if(tape.is_true()) { + return true; + } else if(tape.is_false()) { + return false; + } + return INCORRECT_TYPE; +} +inline simdjson_result element::get_c_str() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: { + return tape.get_c_str(); + } + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_string_length() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: { + return tape.get_string_length(); + } + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_string() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: + return tape.get_string_view(); + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_uint64() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + if(simdjson_unlikely(!tape.is_uint64())) { // branch rarely taken + if(tape.is_int64()) { + int64_t result = tape.next_tape_value(); + if (result < 0) { + return NUMBER_OUT_OF_RANGE; + } + return uint64_t(result); + } + return INCORRECT_TYPE; + } + return tape.next_tape_value(); +} +inline simdjson_result element::get_int64() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + if(simdjson_unlikely(!tape.is_int64())) { // branch rarely taken + if(tape.is_uint64()) { + uint64_t result = 
tape.next_tape_value(); + // Wrapping max in parens to handle Windows issue: https://stackoverflow.com/questions/11544073/how-do-i-deal-with-the-max-macro-in-windows-h-colliding-with-max-in-std + if (result > uint64_t((std::numeric_limits::max)())) { + return NUMBER_OUT_OF_RANGE; + } + return static_cast(result); + } + return INCORRECT_TYPE; + } + return tape.next_tape_value(); +} +inline simdjson_result element::get_double() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + // Performance considerations: + // 1. Querying tape_ref_type() implies doing a shift, it is fast to just do a straight + // comparison. + // 2. Using a switch-case relies on the compiler guessing what kind of code generation + // we want... But the compiler cannot know that we expect the type to be "double" + // most of the time. + // We can expect get to refer to a double type almost all the time. + // It is important to craft the code accordingly so that the compiler can use this + // information. (This could also be solved with profile-guided optimization.) 
+ if(simdjson_unlikely(!tape.is_double())) { // branch rarely taken + if(tape.is_uint64()) { + return double(tape.next_tape_value()); + } else if(tape.is_int64()) { + return double(tape.next_tape_value()); + } + return INCORRECT_TYPE; + } + // this is common: + return tape.next_tape_value(); +} +inline simdjson_result element::get_array() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::START_ARRAY: + return array(tape); + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_object() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::START_OBJECT: + return object(tape); + default: + return INCORRECT_TYPE; + } +} + +template +simdjson_warn_unused simdjson_inline error_code element::get(T &value) const noexcept { + return get().get(value); +} +// An element-specific version prevents recursion with simdjson_result::get(value) +template<> +simdjson_warn_unused simdjson_inline error_code element::get(element &value) const noexcept { + value = element(tape); + return SUCCESS; +} +template +inline void element::tie(T &value, error_code &error) && noexcept { + error = get(value); +} + +template +simdjson_inline bool element::is() const noexcept { + auto result = get(); + return !result.error(); +} + +template<> inline simdjson_result element::get() const noexcept { return get_array(); } +template<> inline simdjson_result element::get() const noexcept { return get_object(); } +template<> inline simdjson_result element::get() const noexcept { return get_c_str(); } +template<> inline simdjson_result element::get() const noexcept { return get_string(); } +template<> inline simdjson_result element::get() const noexcept { return get_int64(); } +template<> inline simdjson_result element::get() const 
noexcept { return get_uint64(); } +template<> inline simdjson_result element::get() const noexcept { return get_double(); } +template<> inline simdjson_result element::get() const noexcept { return get_bool(); } + +inline bool element::is_array() const noexcept { return is(); } +inline bool element::is_object() const noexcept { return is(); } +inline bool element::is_string() const noexcept { return is(); } +inline bool element::is_int64() const noexcept { return is(); } +inline bool element::is_uint64() const noexcept { return is(); } +inline bool element::is_double() const noexcept { return is(); } +inline bool element::is_bool() const noexcept { return is(); } +inline bool element::is_number() const noexcept { return is_int64() || is_uint64() || is_double(); } + +inline bool element::is_null() const noexcept { + return tape.is_null_on_tape(); +} + +#if SIMDJSON_EXCEPTIONS + +inline element::operator bool() const noexcept(false) { return get(); } +inline element::operator const char*() const noexcept(false) { return get(); } +inline element::operator std::string_view() const noexcept(false) { return get(); } +inline element::operator uint64_t() const noexcept(false) { return get(); } +inline element::operator int64_t() const noexcept(false) { return get(); } +inline element::operator double() const noexcept(false) { return get(); } +inline element::operator array() const noexcept(false) { return get(); } +inline element::operator object() const noexcept(false) { return get(); } + +inline array::iterator element::begin() const noexcept(false) { + return get().begin(); +} +inline array::iterator element::end() const noexcept(false) { + return get().end(); +} + +#endif // SIMDJSON_EXCEPTIONS + +inline simdjson_result element::operator[](std::string_view key) const noexcept { + return at_key(key); +} +inline simdjson_result element::operator[](const char *key) const noexcept { + return at_key(key); +} + +inline bool is_pointer_well_formed(std::string_view 
json_pointer) noexcept { + if (simdjson_unlikely(json_pointer[0] != '/')) { + return false; + } + size_t escape = json_pointer.find('~'); + if (escape == std::string_view::npos) { + return true; + } + if (escape == json_pointer.size() - 1) { + return false; + } + if (json_pointer[escape + 1] != '0' && json_pointer[escape + 1] != '1') { + return false; + } + return true; +} + +inline simdjson_result element::at_pointer(std::string_view json_pointer) const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + switch (tape.tape_ref_type()) { + case internal::tape_type::START_OBJECT: + return object(tape).at_pointer(json_pointer); + case internal::tape_type::START_ARRAY: + return array(tape).at_pointer(json_pointer); + default: { + if (!json_pointer.empty()) { // a non-empty string can be invalid, or accessing a primitive (issue 2154) + if (is_pointer_well_formed(json_pointer)) { + return NO_SUCH_FIELD; + } + return INVALID_JSON_POINTER; + } + // an empty string means that we return the current node + dom::element copy(*this); + return simdjson_result(std::move(copy)); + } + } +} +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] +inline simdjson_result element::at(std::string_view json_pointer) const noexcept { + // version 0.4 of simdjson allowed non-compliant pointers + auto std_pointer = (json_pointer.empty() ? 
"" : "/") + std::string(json_pointer.begin(), json_pointer.end()); + return at_pointer(std_pointer); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline simdjson_result element::at(size_t index) const noexcept { + return get().at(index); +} +inline simdjson_result element::at_key(std::string_view key) const noexcept { + return get().at_key(key); +} +inline simdjson_result element::at_key_case_insensitive(std::string_view key) const noexcept { + return get().at_key_case_insensitive(key); +} +inline bool element::operator<(const element &other) const noexcept { + return tape.json_index < other.tape.json_index; +} +inline bool element::operator==(const element &other) const noexcept { + return tape.json_index == other.tape.json_index; +} + +inline bool element::dump_raw_tape(std::ostream &out) const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return tape.doc->dump_raw_tape(out); +} + + +inline std::ostream& operator<<(std::ostream& out, element_type type) { + switch (type) { + case element_type::ARRAY: + return out << "array"; + case element_type::OBJECT: + return out << "object"; + case element_type::INT64: + return out << "int64_t"; + case element_type::UINT64: + return out << "uint64_t"; + case element_type::DOUBLE: + return out << "double"; + case element_type::STRING: + return out << "string"; + case element_type::BOOL: + return out << "bool"; + case element_type::NULL_VALUE: + return out << "null"; + default: + return out << "unexpected content!!!"; // abort() usage is forbidden in the library + } +} + +} // namespace dom + +} // namespace simdjson + +#endif // SIMDJSON_ELEMENT_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/element.h b/contrib/libs/simdjson/include/simdjson/dom/element.h new file mode 100644 index 000000000000..732b89e41ac1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/element.h @@ -0,0 +1,552 @@ +#ifndef SIMDJSON_DOM_ELEMENT_H +#define 
SIMDJSON_DOM_ELEMENT_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/array.h" + +namespace simdjson { +namespace dom { + +/** + * The actual concrete type of a JSON element + * This is the type it is most easily cast to with get<>. + */ +enum class element_type { + ARRAY = '[', ///< dom::array + OBJECT = '{', ///< dom::object + INT64 = 'l', ///< int64_t + UINT64 = 'u', ///< uint64_t: any integer that fits in uint64_t but *not* int64_t + DOUBLE = 'd', ///< double: Any number with a "." or "e" that fits in double. + STRING = '"', ///< std::string_view + BOOL = 't', ///< bool + NULL_VALUE = 'n' ///< null +}; + +/** + * A JSON element. + * + * References an element in a JSON document, representing a JSON null, boolean, string, number, + * array or object. + */ +class element { +public: + /** Create a new, invalid element. */ + simdjson_inline element() noexcept; + + /** The type of this element. */ + simdjson_inline element_type type() const noexcept; + + /** + * Cast this element to an array. + * + * @returns An object that can be used to iterate the array, or: + * INCORRECT_TYPE if the JSON element is not an array. + */ + inline simdjson_result get_array() const noexcept; + /** + * Cast this element to an object. + * + * @returns An object that can be used to look up or iterate the object's fields, or: + * INCORRECT_TYPE if the JSON element is not an object. + */ + inline simdjson_result get_object() const noexcept; + /** + * Cast this element to a null-terminated C string. + * + * The string is guaranteed to be valid UTF-8. + * + * The length of the string is given by get_string_length(). Because JSON strings + * may contain null characters, it may be incorrect to use strlen to determine the + * string length. + * + * It is possible to get a single string_view instance which represents both the string + * content and its length: see get_string(). + * + * @returns A pointer to a null-terminated UTF-8 string. 
This string is stored in the parser and will + * be invalidated the next time it parses a document or when it is destroyed. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_c_str() const noexcept; + /** + * Gives the length in bytes of the string. + * + * It is possible to get a single string_view instance which represents both the string + * content and its length: see get_string(). + * + * @returns A string length in bytes. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_string_length() const noexcept; + /** + * Cast this element to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next time it + * parses a document or when it is destroyed. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_string() const noexcept; + /** + * Cast this element to a signed integer. + * + * @returns A signed 64-bit integer. + * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE + * if it is negative. + */ + inline simdjson_result get_int64() const noexcept; + /** + * Cast this element to an unsigned integer. + * + * @returns An unsigned 64-bit integer. + * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE + * if it is too large. + */ + inline simdjson_result get_uint64() const noexcept; + /** + * Cast this element to a double floating-point. + * + * @returns A double value. + * Returns INCORRECT_TYPE if the JSON element is not a number. + */ + inline simdjson_result get_double() const noexcept; + /** + * Cast this element to a bool. + * + * @returns A bool value. + * Returns INCORRECT_TYPE if the JSON element is not a boolean. + */ + inline simdjson_result get_bool() const noexcept; + + /** + * Whether this element is a json array. + * + * Equivalent to is(). 
+ */ + inline bool is_array() const noexcept; + /** + * Whether this element is a json object. + * + * Equivalent to is(). + */ + inline bool is_object() const noexcept; + /** + * Whether this element is a json string. + * + * Equivalent to is() or is(). + */ + inline bool is_string() const noexcept; + /** + * Whether this element is a json number that fits in a signed 64-bit integer. + * + * Equivalent to is(). + */ + inline bool is_int64() const noexcept; + /** + * Whether this element is a json number that fits in an unsigned 64-bit integer. + * + * Equivalent to is(). + */ + inline bool is_uint64() const noexcept; + /** + * Whether this element is a json number that fits in a double. + * + * Equivalent to is(). + */ + inline bool is_double() const noexcept; + + /** + * Whether this element is a json number. + * + * Both integers and floating points will return true. + */ + inline bool is_number() const noexcept; + + /** + * Whether this element is a json `true` or `false`. + * + * Equivalent to is(). + */ + inline bool is_bool() const noexcept; + /** + * Whether this element is a json `null`. + */ + inline bool is_null() const noexcept; + + /** + * Tell whether the value can be cast to provided type (T). + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + */ + template + simdjson_inline bool is() const noexcept; + + /** + * Get the value as the provided type (T). + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array() or get_string() instead. 
+ * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @returns The value cast to the given type, or: + * INCORRECT_TYPE if the value cannot be cast to the given type. + */ + + template + inline simdjson_result get() const noexcept { + // Unless the simdjson library provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library. " + "The supported types are Boolean (bool), numbers (double, uint64_t, int64_t), " + "strings (std::string_view, const char *), arrays (dom::array) and objects (dom::object). " + "We recommend you use get_double(), get_bool(), get_uint64(), get_int64(), " + "get_object(), get_array() or get_string() instead of the get template."); + } + + /** + * Get the value as the provided type (T). + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @param value The variable to set to the value. May not be set if there is an error. + * + * @returns The error that occurred, or SUCCESS if there was no error. + */ + template + simdjson_warn_unused simdjson_inline error_code get(T &value) const noexcept; + + /** + * Get the value as the provided type (T), setting error if it's not the given type. + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @param value The variable to set to the given type. value is undefined if there is an error. + * @param error The variable to store the error. 
error is set to error_code::SUCCEED if there is an error. + */ + template + inline void tie(T &value, error_code &error) && noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Read this element as a boolean. + * + * @return The boolean value + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a boolean. + */ + inline operator bool() const noexcept(false); + + /** + * Read this element as a null-terminated UTF-8 string. + * + * Be mindful that JSON allows strings to contain null characters. + * + * Does *not* convert other types to a string; requires that the JSON type of the element was + * an actual string. + * + * @return The string value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a string. + */ + inline explicit operator const char*() const noexcept(false); + + /** + * Read this element as a null-terminated UTF-8 string. + * + * Does *not* convert other types to a string; requires that the JSON type of the element was + * an actual string. + * + * @return The string value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a string. + */ + inline operator std::string_view() const noexcept(false); + + /** + * Read this element as an unsigned integer. + * + * @return The integer value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an integer + * @exception simdjson_error(NUMBER_OUT_OF_RANGE) if the integer does not fit in 64 bits or is negative + */ + inline operator uint64_t() const noexcept(false); + /** + * Read this element as an signed integer. + * + * @return The integer value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an integer + * @exception simdjson_error(NUMBER_OUT_OF_RANGE) if the integer does not fit in 64 bits + */ + inline operator int64_t() const noexcept(false); + /** + * Read this element as an double. + * + * @return The double value. 
+ * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a number + */ + inline operator double() const noexcept(false); + /** + * Read this element as a JSON array. + * + * @return The JSON array. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline operator array() const noexcept(false); + /** + * Read this element as a JSON object (key/value pairs). + * + * @return The JSON object. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an object + */ + inline operator object() const noexcept(false); + + /** + * Iterate over each element in this array. + * + * @return The beginning of the iteration. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline dom::array::iterator begin() const noexcept(false); + + /** + * Iterate over each element in this array. + * + * @return The end of the iteration. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline dom::array::iterator end() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](std::string_view key) const noexcept; + + /** + * Get the value associated with the given key. 
+ * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](const char *key) const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. + * + * dom::parser parser; + * element doc = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * doc.at_pointer("/foo/a/1") == 20 + * doc.at_pointer("/foo")["a"].at(1) == 20 + * doc.at_pointer("")["foo"]["a"].at(1) == 20 + * + * It is allowed for a key to be the empty string: + * + * dom::parser parser; + * object obj = parser.parse(R"({ "": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("//a/1") == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(const std::string_view json_pointer) const noexcept; + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + /** + * + * Version 0.4 of simdjson used an incorrect interpretation of the JSON Pointer standard + * and allowed the following : + * + * dom::parser parser; + * element doc = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * doc.at("foo/a/1") == 20 + * + * Though it is intuitive, it is not compliant with RFC 6901 + * https://tools.ietf.org/html/rfc6901 + * + * For standard compliance, use the at_pointer function instead. 
+ * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + [[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] + inline simdjson_result at(const std::string_view json_pointer) const noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API + + /** + * Get the value at the given index. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + inline simdjson_result at(size_t index) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key(std::string_view key) const noexcept; + + /** + * Get the value associated with the given key in a case-insensitive manner. + * + * Note: The key will be matched against **unescaped** JSON. 
+ * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + + /** + * operator< defines a total order for element allowing to use them in + * ordered C++ STL containers + * + * @return TRUE if the key appears before the other one in the tape + */ + inline bool operator<(const element &other) const noexcept; + + /** + * operator== allows to verify if two element values reference the + * same JSON item + * + * @return TRUE if the two values references the same JSON element + */ + inline bool operator==(const element &other) const noexcept; + + /** @private for debugging. Prints out the root element. */ + inline bool dump_raw_tape(std::ostream &out) const noexcept; + +private: + simdjson_inline element(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class document; + friend class object; + friend class array; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; + +}; + +} // namespace dom + +/** The result of a JSON navigation that may fail. 
*/ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::element &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + simdjson_inline simdjson_result type() const noexcept; + template + simdjson_inline bool is() const noexcept; + template + simdjson_inline simdjson_result get() const noexcept; + template + simdjson_warn_unused simdjson_inline error_code get(T &value) const noexcept; + + simdjson_inline simdjson_result get_array() const noexcept; + simdjson_inline simdjson_result get_object() const noexcept; + simdjson_inline simdjson_result get_c_str() const noexcept; + simdjson_inline simdjson_result get_string_length() const noexcept; + simdjson_inline simdjson_result get_string() const noexcept; + simdjson_inline simdjson_result get_int64() const noexcept; + simdjson_inline simdjson_result get_uint64() const noexcept; + simdjson_inline simdjson_result get_double() const noexcept; + simdjson_inline simdjson_result get_bool() const noexcept; + + simdjson_inline bool is_array() const noexcept; + simdjson_inline bool is_object() const noexcept; + simdjson_inline bool is_string() const noexcept; + simdjson_inline bool is_int64() const noexcept; + simdjson_inline bool is_uint64() const noexcept; + simdjson_inline bool is_double() const noexcept; + simdjson_inline bool is_number() const noexcept; + simdjson_inline bool is_bool() const noexcept; + simdjson_inline bool is_null() const noexcept; + + simdjson_inline simdjson_result operator[](std::string_view key) const noexcept; + simdjson_inline simdjson_result operator[](const char *key) const noexcept; + simdjson_inline simdjson_result at_pointer(const std::string_view json_pointer) const noexcept; + [[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] + 
simdjson_inline simdjson_result at(const std::string_view json_pointer) const noexcept; + simdjson_inline simdjson_result at(size_t index) const noexcept; + simdjson_inline simdjson_result at_key(std::string_view key) const noexcept; + simdjson_inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator bool() const noexcept(false); + simdjson_inline explicit operator const char*() const noexcept(false); + simdjson_inline operator std::string_view() const noexcept(false); + simdjson_inline operator uint64_t() const noexcept(false); + simdjson_inline operator int64_t() const noexcept(false); + simdjson_inline operator double() const noexcept(false); + simdjson_inline operator dom::array() const noexcept(false); + simdjson_inline operator dom::object() const noexcept(false); + + simdjson_inline dom::array::iterator begin() const noexcept(false); + simdjson_inline dom::array::iterator end() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + +} // namespace simdjson + +#endif // SIMDJSON_DOM_DOCUMENT_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/object-inl.h b/contrib/libs/simdjson/include/simdjson/dom/object-inl.h new file mode 100644 index 000000000000..c72ab3feb429 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/object-inl.h @@ -0,0 +1,263 @@ +#ifndef SIMDJSON_OBJECT_INL_H +#define SIMDJSON_OBJECT_INL_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/object.h" +#include "simdjson/dom/document.h" + +#include "simdjson/dom/element-inl.h" +#include "simdjson/error-inl.h" + +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::object value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline 
simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} + +inline simdjson_result simdjson_result::operator[](std::string_view key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +inline simdjson_result simdjson_result::operator[](const char *key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +inline simdjson_result simdjson_result::at_key(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key(key); +} +inline simdjson_result simdjson_result::at_key_case_insensitive(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key_case_insensitive(key); +} + +#if SIMDJSON_EXCEPTIONS + +inline dom::object::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +inline dom::object::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +inline size_t simdjson_result::size() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.size(); +} + +#endif // SIMDJSON_EXCEPTIONS + +namespace dom { + +// +// object inline implementation +// +simdjson_inline object::object() noexcept : tape{} {} +simdjson_inline object::object(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline object::iterator object::begin() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return internal::tape_ref(tape.doc, tape.json_index + 1); +} +inline object::iterator object::end() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // 
https://github.com/simdjson/simdjson/issues/1914 + return internal::tape_ref(tape.doc, tape.after_element() - 1); +} +inline size_t object::size() const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + return tape.scope_count(); +} + +inline simdjson_result object::operator[](std::string_view key) const noexcept { + return at_key(key); +} +inline simdjson_result object::operator[](const char *key) const noexcept { + return at_key(key); +} +inline simdjson_result object::at_pointer(std::string_view json_pointer) const noexcept { + SIMDJSON_DEVELOPMENT_ASSERT(tape.usable()); // https://github.com/simdjson/simdjson/issues/1914 + if(json_pointer.empty()) { // an empty string means that we return the current node + return element(this->tape); // copy the current node + } else if(json_pointer[0] != '/') { // otherwise there is an error + return INVALID_JSON_POINTER; + } + json_pointer = json_pointer.substr(1); + size_t slash = json_pointer.find('/'); + std::string_view key = json_pointer.substr(0, slash); + // Grab the child with the given key + simdjson_result child; + + // If there is an escape character in the key, unescape it and then get the child. 
+ size_t escape = key.find('~'); + if (escape != std::string_view::npos) { + // Unescape the key + std::string unescaped(key); + do { + switch (unescaped[escape+1]) { + case '0': + unescaped.replace(escape, 2, "~"); + break; + case '1': + unescaped.replace(escape, 2, "/"); + break; + default: + return INVALID_JSON_POINTER; // "Unexpected ~ escape character in JSON pointer"); + } + escape = unescaped.find('~', escape+1); + } while (escape != std::string::npos); + child = at_key(unescaped); + } else { + child = at_key(key); + } + if(child.error()) { + return child; // we do not continue if there was an error + } + // If there is a /, we have to recurse and look up more of the path + if (slash != std::string_view::npos) { + child = child.at_pointer(json_pointer.substr(slash)); + } + return child; +} + +inline simdjson_result object::at_key(std::string_view key) const noexcept { + iterator end_field = end(); + for (iterator field = begin(); field != end_field; ++field) { + if (field.key_equals(key)) { + return field.value(); + } + } + return NO_SUCH_FIELD; +} +// In case you wonder why we need this, please see +// https://github.com/simdjson/simdjson/issues/323 +// People do seek keys in a case-insensitive manner. 
+inline simdjson_result object::at_key_case_insensitive(std::string_view key) const noexcept { + iterator end_field = end(); + for (iterator field = begin(); field != end_field; ++field) { + if (field.key_equals_case_insensitive(key)) { + return field.value(); + } + } + return NO_SUCH_FIELD; +} + +inline object::operator element() const noexcept { + return element(tape); +} + +// +// object::iterator inline implementation +// +simdjson_inline object::iterator::iterator(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline const key_value_pair object::iterator::operator*() const noexcept { + return key_value_pair(key(), value()); +} +inline bool object::iterator::operator!=(const object::iterator& other) const noexcept { + return tape.json_index != other.tape.json_index; +} +inline bool object::iterator::operator==(const object::iterator& other) const noexcept { + return tape.json_index == other.tape.json_index; +} +inline bool object::iterator::operator<(const object::iterator& other) const noexcept { + return tape.json_index < other.tape.json_index; +} +inline bool object::iterator::operator<=(const object::iterator& other) const noexcept { + return tape.json_index <= other.tape.json_index; +} +inline bool object::iterator::operator>=(const object::iterator& other) const noexcept { + return tape.json_index >= other.tape.json_index; +} +inline bool object::iterator::operator>(const object::iterator& other) const noexcept { + return tape.json_index > other.tape.json_index; +} +inline object::iterator& object::iterator::operator++() noexcept { + tape.json_index++; + tape.json_index = tape.after_element(); + return *this; +} +inline object::iterator object::iterator::operator++(int) noexcept { + object::iterator out = *this; + ++*this; + return out; +} +inline std::string_view object::iterator::key() const noexcept { + return tape.get_string_view(); +} +inline uint32_t object::iterator::key_length() const noexcept { + return tape.get_string_length(); +} 
+inline const char* object::iterator::key_c_str() const noexcept { + return reinterpret_cast(&tape.doc->string_buf[size_t(tape.tape_value()) + sizeof(uint32_t)]); +} +inline element object::iterator::value() const noexcept { + return element(internal::tape_ref(tape.doc, tape.json_index + 1)); +} + +/** + * Design notes: + * Instead of constructing a string_view and then comparing it with a + * user-provided strings, it is probably more performant to have dedicated + * functions taking as a parameter the string we want to compare against + * and return true when they are equal. That avoids the creation of a temporary + * std::string_view. Though it is possible for the compiler to avoid entirely + * any overhead due to string_view, relying too much on compiler magic is + * problematic: compiler magic sometimes fail, and then what do you do? + * Also, enticing users to rely on high-performance function is probably better + * on the long run. + */ + +inline bool object::iterator::key_equals(std::string_view o) const noexcept { + // We use the fact that the key length can be computed quickly + // without access to the string buffer. + const uint32_t len = key_length(); + if(o.size() == len) { + // We avoid construction of a temporary string_view instance. + return (memcmp(o.data(), key_c_str(), len) == 0); + } + return false; +} + +inline bool object::iterator::key_equals_case_insensitive(std::string_view o) const noexcept { + // We use the fact that the key length can be computed quickly + // without access to the string buffer. + const uint32_t len = key_length(); + if(o.size() == len) { + // See For case-insensitive string comparisons, avoid char-by-char functions + // https://lemire.me/blog/2020/04/30/for-case-insensitive-string-comparisons-avoid-char-by-char-functions/ + // Note that it might be worth rolling our own strncasecmp function, with vectorization. 
+ return (simdjson_strncasecmp(o.data(), key_c_str(), len) == 0); + } + return false; +} +// +// key_value_pair inline implementation +// +inline key_value_pair::key_value_pair(std::string_view _key, element _value) noexcept : + key(_key), value(_value) {} + +} // namespace dom + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +static_assert(std::ranges::view); +static_assert(std::ranges::sized_range); +#if SIMDJSON_EXCEPTIONS +static_assert(std::ranges::view>); +static_assert(std::ranges::sized_range>); +#endif // SIMDJSON_EXCEPTIONS +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_OBJECT_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/object.h b/contrib/libs/simdjson/include/simdjson/dom/object.h new file mode 100644 index 000000000000..8f6884baf1ea --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/object.h @@ -0,0 +1,274 @@ +#ifndef SIMDJSON_DOM_OBJECT_H +#define SIMDJSON_DOM_OBJECT_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/element.h" +#include "simdjson/internal/tape_ref.h" + +namespace simdjson { +namespace dom { + +/** + * JSON object. + */ +class object { +public: + /** Create a new, invalid object */ + simdjson_inline object() noexcept; + + class iterator { + public: + using value_type = const key_value_pair; + using difference_type = std::ptrdiff_t; + using pointer = void; + using reference = value_type; + using iterator_category = std::forward_iterator_tag; + + /** + * Get the actual key/value pair + */ + inline reference operator*() const noexcept; + /** + * Get the next key/value pair. + * + * Part of the std::iterator interface. + * + */ + inline iterator& operator++() noexcept; + /** + * Get the next key/value pair. + * + * Part of the std::iterator interface. + * + */ + inline iterator operator++(int) noexcept; + /** + * Check if these values come from the same place in the JSON. + * + * Part of the std::iterator interface. 
+ */ + inline bool operator!=(const iterator& other) const noexcept; + inline bool operator==(const iterator& other) const noexcept; + + inline bool operator<(const iterator& other) const noexcept; + inline bool operator<=(const iterator& other) const noexcept; + inline bool operator>=(const iterator& other) const noexcept; + inline bool operator>(const iterator& other) const noexcept; + /** + * Get the key of this key/value pair. + */ + inline std::string_view key() const noexcept; + /** + * Get the length (in bytes) of the key in this key/value pair. + * You should expect this function to be faster than key().size(). + */ + inline uint32_t key_length() const noexcept; + /** + * Returns true if the key in this key/value pair is equal + * to the provided string_view. + */ + inline bool key_equals(std::string_view o) const noexcept; + /** + * Returns true if the key in this key/value pair is equal + * to the provided string_view in a case-insensitive manner. + * Case comparisons may only be handled correctly for ASCII strings. + */ + inline bool key_equals_case_insensitive(std::string_view o) const noexcept; + /** + * Get the key of this key/value pair. + */ + inline const char *key_c_str() const noexcept; + /** + * Get the value of this key/value pair. + */ + inline element value() const noexcept; + + iterator() noexcept = default; + iterator(const iterator&) noexcept = default; + iterator& operator=(const iterator&) noexcept = default; + private: + simdjson_inline iterator(const internal::tape_ref &tape) noexcept; + + internal::tape_ref tape; + + friend class object; + }; + + /** + * Return the first key/value pair. + * + * Part of the std::iterable interface. + */ + inline iterator begin() const noexcept; + /** + * One past the last key/value pair. + * + * Part of the std::iterable interface. + */ + inline iterator end() const noexcept; + /** + * Get the size of the object (number of keys). 
+ * It is a saturated value with a maximum of 0xFFFFFF: if the value + * is 0xFFFFFF then the size is 0xFFFFFF or greater. + */ + inline size_t size() const noexcept; + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](std::string_view key) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](const char *key) const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. 
+ * + * dom::parser parser; + * object obj = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("/foo/a/1") == 20 + * obj.at_pointer("/foo")["a"].at(1) == 20 + * + * It is allowed for a key to be the empty string: + * + * dom::parser parser; + * object obj = parser.parse(R"({ "": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("//a/1") == 20 + * obj.at_pointer("/")["a"].at(1) == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key(std::string_view key) const noexcept; + + /** + * Get the value associated with the given key in a case-insensitive manner. + * It is only guaranteed to work over ASCII inputs. + * + * Note: The key will be matched against **unescaped** JSON. + * + * This function has linear-time complexity: the keys are checked one by one. 
+ * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + + /** + * Implicitly convert object to element + */ + inline operator element() const noexcept; + +private: + simdjson_inline object(const internal::tape_ref &tape) noexcept; + + internal::tape_ref tape; + + friend class element; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; +}; + +/** + * Key/value pair in an object. + */ +class key_value_pair { +public: + /** key in the key-value pair **/ + std::string_view key; + /** value in the key-value pair **/ + element value; + +private: + simdjson_inline key_value_pair(std::string_view _key, element _value) noexcept; + friend class object; +}; + +} // namespace dom + +/** The result of a JSON conversion that may fail. */ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::object value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + inline simdjson_result operator[](std::string_view key) const noexcept; + inline simdjson_result operator[](const char *key) const noexcept; + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + inline simdjson_result at_key(std::string_view key) const noexcept; + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + +#if SIMDJSON_EXCEPTIONS + inline dom::object::iterator begin() const noexcept(false); + inline dom::object::iterator end() const noexcept(false); + inline size_t size() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +#include + +namespace std { +namespace ranges { +template<> +inline constexpr bool 
enable_view = true; +#if SIMDJSON_EXCEPTIONS +template<> +inline constexpr bool enable_view> = true; +#endif // SIMDJSON_EXCEPTIONS +} // namespace ranges +} // namespace std +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_DOM_OBJECT_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/parser-inl.h b/contrib/libs/simdjson/include/simdjson/dom/parser-inl.h new file mode 100644 index 000000000000..14ba6c83330d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/parser-inl.h @@ -0,0 +1,258 @@ +#ifndef SIMDJSON_PARSER_INL_H +#define SIMDJSON_PARSER_INL_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/document_stream.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/dom_parser_implementation.h" + +#include "simdjson/error-inl.h" +#include "simdjson/padded_string-inl.h" +#include "simdjson/dom/document_stream-inl.h" +#include "simdjson/dom/element-inl.h" + +#include +#include /* memcmp */ + +namespace simdjson { +namespace dom { + +// +// parser inline implementation +// +simdjson_inline parser::parser(size_t max_capacity) noexcept + : _max_capacity{max_capacity}, + loaded_bytes(nullptr) { +} +simdjson_inline parser::parser(parser &&other) noexcept = default; +simdjson_inline parser &parser::operator=(parser &&other) noexcept = default; + +inline bool parser::is_valid() const noexcept { return valid; } +inline int parser::get_error_code() const noexcept { return error; } +inline std::string parser::get_error_message() const noexcept { return error_message(error); } + +inline bool parser::dump_raw_tape(std::ostream &os) const noexcept { + return valid ? 
doc.dump_raw_tape(os) : false; +} + +inline simdjson_result parser::read_file(const std::string &path) noexcept { + // Open the file + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + std::FILE *fp = std::fopen(path.c_str(), "rb"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (fp == nullptr) { + return IO_ERROR; + } + + // Get the file size + int ret; +#if SIMDJSON_VISUAL_STUDIO && !SIMDJSON_IS_32BITS + ret = _fseeki64(fp, 0, SEEK_END); +#else + ret = std::fseek(fp, 0, SEEK_END); +#endif // _WIN64 + if(ret < 0) { + std::fclose(fp); + return IO_ERROR; + } +#if SIMDJSON_VISUAL_STUDIO && !SIMDJSON_IS_32BITS + __int64 len = _ftelli64(fp); + if(len == -1L) { + std::fclose(fp); + return IO_ERROR; + } +#else + long len = std::ftell(fp); + if((len < 0) || (len == LONG_MAX)) { + std::fclose(fp); + return IO_ERROR; + } +#endif + + // Make sure we have enough capacity to load the file + if (_loaded_bytes_capacity < size_t(len)) { + loaded_bytes.reset( internal::allocate_padded_buffer(len) ); + if (!loaded_bytes) { + std::fclose(fp); + return MEMALLOC; + } + _loaded_bytes_capacity = len; + } + + // Read the string + std::rewind(fp); + size_t bytes_read = std::fread(loaded_bytes.get(), 1, len, fp); + if (std::fclose(fp) != 0 || bytes_read != size_t(len)) { + return IO_ERROR; + } + + return bytes_read; +} + +inline simdjson_result parser::load(const std::string &path) & noexcept { + return load_into_document(doc, path); +} + +inline simdjson_result parser::load_into_document(document& provided_doc, const std::string &path) & noexcept { + size_t len; + auto _error = read_file(path).get(len); + if (_error) { return _error; } + return parse_into_document(provided_doc, loaded_bytes.get(), len, false); +} + +inline simdjson_result parser::load_many(const std::string &path, size_t batch_size) noexcept { + size_t len; + auto _error = read_file(path).get(len); + if (_error) { return _error; } + 
if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + return document_stream(*this, reinterpret_cast(loaded_bytes.get()), len, batch_size); +} + +inline simdjson_result parser::parse_into_document(document& provided_doc, const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept { + // Important: we need to ensure that document has enough capacity. + // Important: It is possible that provided_doc is actually the internal 'doc' within the parser!!! + error_code _error = ensure_capacity(provided_doc, len); + if (_error) { return _error; } + if (realloc_if_needed) { + // Make sure we have enough capacity to copy len bytes + if (!loaded_bytes || _loaded_bytes_capacity < len) { + loaded_bytes.reset( internal::allocate_padded_buffer(len) ); + if (!loaded_bytes) { + return MEMALLOC; + } + _loaded_bytes_capacity = len; + } + std::memcpy(static_cast(loaded_bytes.get()), buf, len); + buf = reinterpret_cast(loaded_bytes.get()); + } + + if((len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0)) { + buf += 3; + len -= 3; + } + _error = implementation->parse(buf, len, provided_doc); + + if (_error) { return _error; } + + return provided_doc.root(); +} + +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const char *buf, size_t len, bool realloc_if_needed) & noexcept { + return parse_into_document(provided_doc, reinterpret_cast(buf), len, realloc_if_needed); +} +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const std::string &s) & noexcept { + return parse_into_document(provided_doc, s.data(), s.length(), s.capacity() - s.length() < SIMDJSON_PADDING); +} +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const padded_string &s) & noexcept { + return parse_into_document(provided_doc, s.data(), s.length(), false); +} + + +inline simdjson_result parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept { + return 
parse_into_document(doc, buf, len, realloc_if_needed); +} + +simdjson_inline simdjson_result parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept { + return parse(reinterpret_cast(buf), len, realloc_if_needed); +} +simdjson_inline simdjson_result parser::parse(const std::string &s) & noexcept { + return parse(s.data(), s.length(), s.capacity() - s.length() < SIMDJSON_PADDING); +} +simdjson_inline simdjson_result parser::parse(const padded_string &s) & noexcept { + return parse(s.data(), s.length(), false); +} +simdjson_inline simdjson_result parser::parse(const padded_string_view &v) & noexcept { + return parse(v.data(), v.length(), false); +} + +inline simdjson_result parser::parse_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept { + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + if((len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0)) { + buf += 3; + len -= 3; + } + return document_stream(*this, buf, len, batch_size); +} +inline simdjson_result parser::parse_many(const char *buf, size_t len, size_t batch_size) noexcept { + return parse_many(reinterpret_cast(buf), len, batch_size); +} +inline simdjson_result parser::parse_many(const std::string &s, size_t batch_size) noexcept { + return parse_many(s.data(), s.length(), batch_size); +} +inline simdjson_result parser::parse_many(const padded_string &s, size_t batch_size) noexcept { + return parse_many(s.data(), s.length(), batch_size); +} + +simdjson_inline size_t parser::capacity() const noexcept { + return implementation ? implementation->capacity() : 0; +} +simdjson_inline size_t parser::max_capacity() const noexcept { + return _max_capacity; +} +simdjson_pure simdjson_inline size_t parser::max_depth() const noexcept { + return implementation ? 
implementation->max_depth() : DEFAULT_MAX_DEPTH; +} + +simdjson_warn_unused +inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept { + // + // Reallocate implementation if needed + // + error_code err; + if (implementation) { + err = implementation->allocate(capacity, max_depth); + } else { + err = simdjson::get_active_implementation()->create_dom_parser_implementation(capacity, max_depth, implementation); + } + if (err) { return err; } + return SUCCESS; +} + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +simdjson_warn_unused +inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcept { + return !allocate(capacity, max_depth); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept { + return ensure_capacity(doc, desired_capacity); +} + + +inline error_code parser::ensure_capacity(document& target_document, size_t desired_capacity) noexcept { + // 1. It is wasteful to allocate a document and a parser for documents spanning less than MINIMAL_DOCUMENT_CAPACITY bytes. + // 2. If we allow desired_capacity = 0 then it is possible to exit this function with implementation == nullptr. + if(desired_capacity < MINIMAL_DOCUMENT_CAPACITY) { desired_capacity = MINIMAL_DOCUMENT_CAPACITY; } + // If we don't have enough capacity, (try to) automatically bump it. + // If the document needs allocation, do it too. + // Both in one if statement to minimize unlikely branching. + // + // Note: we must make sure that this function is called if capacity() == 0. We do so because we + // ensure that desired_capacity > 0. + if (simdjson_unlikely(capacity() < desired_capacity || target_document.capacity() < desired_capacity)) { + if (desired_capacity > max_capacity()) { + return error = CAPACITY; + } + error_code err1 = target_document.capacity() < desired_capacity ? target_document.allocate(desired_capacity) : SUCCESS; + error_code err2 = capacity() < desired_capacity ? 
allocate(desired_capacity, max_depth()) : SUCCESS; + if(err1 != SUCCESS) { return error = err1; } + if(err2 != SUCCESS) { return error = err2; } + } + return SUCCESS; +} + +simdjson_inline void parser::set_max_capacity(size_t max_capacity) noexcept { + if(max_capacity > MINIMAL_DOCUMENT_CAPACITY) { + _max_capacity = max_capacity; + } else { + _max_capacity = MINIMAL_DOCUMENT_CAPACITY; + } +} + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_PARSER_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/parser.h b/contrib/libs/simdjson/include/simdjson/dom/parser.h new file mode 100644 index 000000000000..a22a1a68b16d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/parser.h @@ -0,0 +1,650 @@ +#ifndef SIMDJSON_DOM_PARSER_H +#define SIMDJSON_DOM_PARSER_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/document.h" + +namespace simdjson { + +namespace dom { + +/** + * A persistent document parser. + * + * The parser is designed to be reused, holding the internal buffers necessary to do parsing, + * as well as memory for a single document. The parsed document is overwritten on each parse. + * + * This class cannot be copied, only moved, to avoid unintended allocations. + * + * @note Moving a parser instance may invalidate "dom::element" instances. If you need to + * preserve both the "dom::element" instances and the parser, consider wrapping the parser + * instance in a std::unique_ptr instance: + * + * std::unique_ptr parser(new dom::parser{}); + * auto error = parser->load(f).get(root); + * + * You can then move std::unique_ptr safely. + * + * @note This is not thread safe: one parser cannot produce two documents at the same time! + */ +class parser { +public: + /** + * Create a JSON parser. + * + * The new parser will have zero capacity. + * + * @param max_capacity The maximum document length the parser can automatically handle. 
The parser + * will allocate more capacity on an as needed basis (when it sees documents too big to handle) + * up to this amount. The parser still starts with zero capacity no matter what this number is: + * to allocate an initial capacity, call allocate() after constructing the parser. + * Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process). + */ + simdjson_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; + /** + * Take another parser's buffers and state. + * + * @param other The parser to take. Its capacity is zeroed. + */ + simdjson_inline parser(parser &&other) noexcept; + parser(const parser &) = delete; ///< @private Disallow copying + /** + * Take another parser's buffers and state. + * + * @param other The parser to take. Its capacity is zeroed. + */ + simdjson_inline parser &operator=(parser &&other) noexcept; + parser &operator=(const parser &) = delete; ///< @private Disallow copying + + /** Deallocate the JSON parser. */ + ~parser()=default; + + /** + * Load a JSON document from a file and return a reference to it. + * + * dom::parser parser; + * const element doc = parser.load("jsonexamples/twitter.json"); + * + * The function is eager: the file's content is loaded in memory inside the parser instance + * and immediately parsed. The file can be deleted after the `parser.load` call. + * + * ### IMPORTANT: Document Lifetime + * + * The JSON document still lives in the parser: this is the most efficient way to parse JSON + * documents because it reuses the same buffers, but you *must* use the document before you + * destroy the parser or call parse() again. + * + * Moving the parser instance is safe, but it invalidates the element instances. You may store + * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like + * so: `std::unique_ptr parser(new dom::parser{});`. 
+ * + * ### Parser Capacity + * + * If the parser's current capacity is less than the file length, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * ## Windows and Unicode + * + * Windows users who need to read files with non-ANSI characters in the + * name should set their code page to UTF-8 (65001) before calling this + * function. This should be the default with Windows 11 and better. + * Further, they may use the AreFileApisANSI function to determine whether + * the filename is interpreted using the ANSI or the system default OEM + * codepage, and they may call SetFileApisToOEM accordingly. + * + * @param path The path to load. + * @return The document, or an error: + * - IO_ERROR if there was an error opening or reading the file. + * Be mindful that on some 32-bit systems, + * the file size might be limited to 2 GB. + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result load(const std::string &path) & noexcept; + inline simdjson_result load(const std::string &path) && = delete ; + + /** + * Load a JSON document from a file into a provide document instance and return a temporary reference to it. + * It is similar to the function `load` except that instead of parsing into the internal + * `document` instance associated with the parser, it allows the user to provide a document + * instance. + * + * dom::parser parser; + * dom::document doc; + * element doc_root = parser.load_into_document(doc, "jsonexamples/twitter.json"); + * + * The function is eager: the file's content is loaded in memory inside the parser instance + * and immediately parsed. 
The file can be deleted after the `parser.load_into_document` call. + * + * ### IMPORTANT: Document Lifetime + * + * After the call to load_into_document, the parser is no longer needed. + * + * The JSON document lives in the document instance: you must keep the document + * instance alive while you navigate through it (i.e., used the returned value from + * load_into_document). You are encourage to reuse the document instance + * many times with new data to avoid reallocations: + * + * dom::document doc; + * element doc_root1 = parser.load_into_document(doc, "jsonexamples/twitter.json"); + * //... doc_root1 is a pointer inside doc + * element doc_root2 = parser.load_into_document(doc, "jsonexamples/twitter.json"); + * //... doc_root2 is a pointer inside doc + * // at this point doc_root1 is no longer safe + * + * Moving the document instance is safe, but it invalidates the element instances. After + * moving a document, you can recover safe access to the document root with its `root()` method. + * + * @param doc The document instance where the parsed data will be stored (on success). + * @param path The path to load. + * @return The document, or an error: + * - IO_ERROR if there was an error opening or reading the file. + * Be mindful that on some 32-bit systems, + * the file size might be limited to 2 GB. + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result load_into_document(document& doc, const std::string &path) & noexcept; + inline simdjson_result load_into_document(document& doc, const std::string &path) && =delete; + + /** + * Parse a JSON document and return a temporary reference to it. 
+ * + * dom::parser parser; + * element doc_root = parser.parse(buf, len); + * + * The function eagerly parses the input: the input can be modified and discarded after + * the `parser.parse(buf, len)` call has completed. + * + * ### IMPORTANT: Document Lifetime + * + * The JSON document still lives in the parser: this is the most efficient way to parse JSON + * documents because it reuses the same buffers, but you *must* use the document before you + * destroy the parser or call parse() again. + * + * Moving the parser instance is safe, but it invalidates the element instances. You may store + * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like + * so: `std::unique_ptr parser(new dom::parser{});`. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you + * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the + * SIMDJSON_PADDING bytes to avoid runtime warnings. + * + * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding, + * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe: + * + * const char *json = R"({"key":"value"})"; + * const size_t json_len = std::strlen(json); + * simdjson::dom::parser parser; + * simdjson::dom::element element = parser.parse(json, json_len); + * + * If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)), + * you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end. + * The benefit of setting realloc_if_needed to false is that you avoid a temporary + * memory allocation and a copy. + * + * The padded bytes may be read. 
It is not important how you initialize + * these bytes though we recommend a sensible default like null character values or spaces. + * For example, the following low-level code is safe: + * + * const char *json = R"({"key":"value"})"; + * const size_t json_len = std::strlen(json); + * std::unique_ptr padded_json_copy{new char[json_len + SIMDJSON_PADDING]}; + * std::memcpy(padded_json_copy.get(), json, json_len); + * std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING); + * simdjson::dom::parser parser; + * simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false); + * + * ### Parser Capacity + * + * If the parser's current capacity is less than len, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless + * realloc_if_needed is true. + * @param len The length of the JSON. + * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. + * @return An element pointing at the root of the document, or an error: + * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, + * and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). 
+ */ + inline simdjson_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; + inline simdjson_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; + simdjson_inline simdjson_result parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const std::string &s) & noexcept; + simdjson_inline simdjson_result parse(const std::string &s) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const padded_string &s) & noexcept; + simdjson_inline simdjson_result parse(const padded_string &s) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const padded_string_view &v) & noexcept; + simdjson_inline simdjson_result parse(const padded_string_view &v) && =delete; + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_inline simdjson_result parse(const char *buf) noexcept = delete; + + /** + * Parse a JSON document into a provide document instance and return a temporary reference to it. + * It is similar to the function `parse` except that instead of parsing into the internal + * `document` instance associated with the parser, it allows the user to provide a document + * instance. + * + * dom::parser parser; + * dom::document doc; + * element doc_root = parser.parse_into_document(doc, buf, len); + * + * The function eagerly parses the input: the input can be modified and discarded after + * the `parser.parse(buf, len)` call has completed. 
+ * + * ### IMPORTANT: Document Lifetime + * + * After the call to parse_into_document, the parser is no longer needed. + * + * The JSON document lives in the document instance: you must keep the document + * instance alive while you navigate through it (i.e., used the returned value from + * parse_into_document). You are encourage to reuse the document instance + * many times with new data to avoid reallocations: + * + * dom::document doc; + * element doc_root1 = parser.parse_into_document(doc, buf1, len); + * //... doc_root1 is a pointer inside doc + * element doc_root2 = parser.parse_into_document(doc, buf1, len); + * //... doc_root2 is a pointer inside doc + * // at this point doc_root1 is no longer safe + * + * Moving the document instance is safe, but it invalidates the element instances. After + * moving a document, you can recover safe access to the document root with its `root()` method. + * + * @param doc The document instance where the parsed data will be stored (on success). + * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless + * realloc_if_needed is true. + * @param len The length of the JSON. + * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. + * @return An element pointing at the root of document, or an error: + * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, + * and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). 
+ */ + inline simdjson_result parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; + inline simdjson_result parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const std::string &s) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const std::string &s) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const padded_string &s) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const padded_string &s) && =delete; + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf) noexcept = delete; + + /** + * Load a file containing many JSON documents. + * + * dom::parser parser; + * for (const element doc : parser.load_many(path)) { + * cout << std::string(doc["title"]) << endl; + * } + * + * The file is loaded in memory and can be safely deleted after the `parser.load_many(path)` + * function has returned. The memory is held by the `parser` instance. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. 
+ * And, possibly, no document many have been parsed when the `parser.load_many(path)` function + * returned. + * + * If there is a UTF-8 BOM, the parser skips it. + * + * ### Format + * + * The file must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * Documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with whitespace. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excesively small values may impact negatively the + * performance. + * + * ### Error Handling + * + * All errors are returned during iteration: if there is a global error such as memory allocation, + * it will be yielded as the first result. Iteration always stops after the first error. + * + * As with all other simdjson methods, non-exception error handling is readily available through + * the same interface, requiring you to check the error before using the document: + * + * dom::parser parser; + * dom::document_stream docs; + * auto error = parser.load_many(path).get(docs); + * if (error) { cerr << error << endl; exit(1); } + * for (auto doc : docs) { + * std::string_view title; + * if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); } + * cout << title << endl; + * } + * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. 
+ * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param path File name pointing at the concatenated JSON to parse. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet + * spot in our tests. + * If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE + * (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE. + * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: + * - IO_ERROR if there was an error opening or reading the file. + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + + /** + * Parse a buffer containing many JSON documents. + * + * dom::parser parser; + * for (element doc : parser.parse_many(buf, len)) { + * cout << std::string(doc["title"]) << endl; + * } + * + * No copy of the input buffer is made. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. + * And, possibly, no document many have been parsed when the `parser.load_many(path)` function + * returned. 
+ * + * The caller is responsabile to ensure that the input string data remains unchanged and is + * not deleted during the loop. In particular, the following is unsafe and will not compile: + * + * auto docs = parser.parse_many("[\"temporary data\"]"_padded); + * // here the string "[\"temporary data\"]" may no longer exist in memory + * // the parser instance may not have even accessed the input yet + * for (element doc : docs) { + * cout << std::string(doc["title"]) << endl; + * } + * + * The following is safe: + * + * auto json = "[\"temporary data\"]"_padded; + * auto docs = parser.parse_many(json); + * for (element doc : docs) { + * cout << std::string(doc["title"]) << endl; + * } + * + * If there is a UTF-8 BOM, the parser skips it. + * + * ### Format + * + * The buffer must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with whitespace. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excesively small values may impact negatively the + * performance. + * + * ### Error Handling + * + * All errors are returned during iteration: if there is a global error such as memory allocation, + * it will be yielded as the first result. Iteration always stops after the first error. 
+ * + * As with all other simdjson methods, non-exception error handling is readily available through + * the same interface, requiring you to check the error before using the document: + * + * dom::parser parser; + * dom::document_stream docs; + * auto error = parser.load_many(path).get(docs); + * if (error) { cerr << error << endl; exit(1); } + * for (auto doc : docs) { + * std::string_view title; + * if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); } + * cout << title << endl; + * } + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you + * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the + * SIMDJSON_PADDING bytes to avoid runtime warnings. + * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes. + * @param len The length of the concatenated JSON. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 10MB, which has been a reasonable sweet spot in our tests. + * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. 
+ * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_result parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + /** + * @private deprecated because it returns bool instead of error_code, which is our standard for + * failures. Use allocate() instead. 
+ * + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return true if successful, false if allocation failed. + */ + [[deprecated("Use allocate() instead.")]] + simdjson_warn_unused inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API + /** + * The largest document this parser can support without reallocating. + * + * @return Current capacity, in bytes. + */ + simdjson_inline size_t capacity() const noexcept; + + /** + * The largest document this parser can automatically support. + * + * The parser may reallocate internal buffers as needed up to this amount. + * + * @return Maximum capacity, in bytes. + */ + simdjson_inline size_t max_capacity() const noexcept; + + /** + * The maximum level of nested object and arrays supported by this parser. + * + * @return Maximum depth, in bytes. + */ + simdjson_pure simdjson_inline size_t max_depth() const noexcept; + + /** + * Set max_capacity. This is the largest document this parser can automatically support. + * + * The parser may reallocate internal buffers as needed up to this amount as documents are passed + * to it. + * + * Note: To avoid limiting the memory to an absurd value, such as zero or two bytes, + * iff you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY, + * then the maximal capacity is set to MINIMAL_DOCUMENT_CAPACITY. + * + * This call will not allocate or deallocate, even if capacity is currently above max_capacity. + * + * @param max_capacity The new maximum capacity, in bytes. + */ + simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; + +#ifdef SIMDJSON_THREADS_ENABLED + /** + * The parser instance can use threads when they are available to speed up some + * operations. 
It is enabled by default. Changing this attribute will change the + * behavior of the parser for future operations. + */ + bool threaded{true}; +#endif + /** @private Use the new DOM API instead */ + class Iterator; + /** @private Use simdjson_error instead */ + using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error; + + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; + + /** @private Use `if (parser.parse(...).error())` instead */ + bool valid{false}; + /** @private Use `parser.parse(...).error()` instead */ + error_code error{UNINITIALIZED}; + + /** @private Use `parser.parse(...).value()` instead */ + document doc{}; + + /** @private returns true if the document parsed was valid */ + [[deprecated("Use the result of parser.parse() instead")]] + inline bool is_valid() const noexcept; + + /** + * @private return an error code corresponding to the last parsing attempt, see + * simdjson.h will return UNINITIALIZED if no parsing was attempted + */ + [[deprecated("Use the result of parser.parse() instead")]] + inline int get_error_code() const noexcept; + + /** @private return the string equivalent of "get_error_code" */ + [[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]] + inline std::string get_error_message() const noexcept; + + /** @private */ + [[deprecated("Use cout << on the result of parser.parse() instead")]] + inline bool print_json(std::ostream &os) const noexcept; + + /** @private Private and deprecated: use `parser.parse(...).doc.dump_raw_tape()` instead */ + inline bool dump_raw_tape(std::ostream &os) const noexcept; + + +private: + /** + * The maximum document length this parser will automatically support. + * + * The parser will not be automatically allocated above this amount. 
+ */ + size_t _max_capacity; + + /** + * The loaded buffer (reused each time load() is called) + */ + std::unique_ptr loaded_bytes; + + /** Capacity of loaded_bytes buffer. */ + size_t _loaded_bytes_capacity{0}; + + // all nodes are stored on the doc.tape using a 64-bit word. + // + // strings, double and ints are stored as + // a 64-bit word with a pointer to the actual value + // + // + // + // for objects or arrays, store [ or { at the beginning and } and ] at the + // end. For the openings ([ or {), we annotate them with a reference to the + // location on the doc.tape of the end, and for then closings (} and ]), we + // annotate them with a reference to the location of the opening + // + // + + /** + * Ensure we have enough capacity to handle at least desired_capacity bytes, + * and auto-allocate if not. This also allocates memory if needed in the + * internal document. + */ + inline error_code ensure_capacity(size_t desired_capacity) noexcept; + /** + * Ensure we have enough capacity to handle at least desired_capacity bytes, + * and auto-allocate if not. This also allocates memory if needed in the + * provided document. 
+ */ + inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept; + + /** Read the file into loaded_bytes */ + inline simdjson_result read_file(const std::string &path) noexcept; + + friend class parser::Iterator; + friend class document_stream; + + +}; // class parser + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_DOM_PARSER_H diff --git a/contrib/libs/simdjson/include/simdjson/dom/serialization-inl.h b/contrib/libs/simdjson/include/simdjson/dom/serialization-inl.h new file mode 100644 index 000000000000..0c52a26cb1c4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/serialization-inl.h @@ -0,0 +1,536 @@ + +#ifndef SIMDJSON_SERIALIZATION_INL_H +#define SIMDJSON_SERIALIZATION_INL_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/serialization.h" +#include "simdjson/dom/parser.h" +#include "simdjson/internal/tape_type.h" + +#include "simdjson/dom/array-inl.h" +#include "simdjson/dom/object-inl.h" +#include "simdjson/internal/tape_ref-inl.h" + +#include + +namespace simdjson { +namespace dom { +inline bool parser::print_json(std::ostream &os) const noexcept { + if (!valid) { return false; } + simdjson::internal::string_builder<> sb; + sb.append(doc.root()); + std::string_view answer = sb.str(); + os << answer; + return true; +} + +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::element value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::array value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if 
(x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::object value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif + +} // namespace dom + +/*** + * Number utility functions + **/ +namespace { +/**@private + * Escape sequence like \b or \u0001 + * We expect that most compilers will use 8 bytes for this data structure. + **/ +struct escape_sequence { + uint8_t length; + const char string[7]; // technically, we only ever need 6 characters, we pad to 8 +}; +/**@private + * This converts a signed integer into a character sequence. + * The caller is responsible for providing enough memory (at least + * 20 characters.) + * Though various runtime libraries provide itoa functions, + * it is not part of the C++ standard. The C++17 standard + * adds the to_chars functions which would do as well, but + * we want to support C++11. + */ +static char *fast_itoa(char *output, int64_t value) noexcept { + // This is a standard implementation of itoa. + char buffer[20]; + uint64_t value_positive; + // In general, negating a signed integer is unsafe. + if(value < 0) { + *output++ = '-'; + // Doing value_positive = -value; while avoiding + // undefined behavior warnings. + // It assumes two complement's which is universal at this + // point in time. + std::memcpy(&value_positive, &value, sizeof(value)); + value_positive = (~value_positive) + 1; // this is a negation + } else { + value_positive = value; + } + // We work solely with value_positive. It *might* be easier + // for an optimizing compiler to deal with an unsigned variable + // as far as performance goes. 
+ const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value_positive >= 10) { + *write_pointer-- = char('0' + (value_positive % 10)); + value_positive /= 10; + } + *write_pointer = char('0' + value_positive); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; +} +/**@private + * This converts an unsigned integer into a character sequence. + * The caller is responsible for providing enough memory (at least + * 19 characters.) + * Though various runtime libraries provide itoa functions, + * it is not part of the C++ standard. The C++17 standard + * adds the to_chars functions which would do as well, but + * we want to support C++11. + */ +static char *fast_itoa(char *output, uint64_t value) noexcept { + // This is a standard implementation of itoa. + char buffer[20]; + const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value >= 10) { + *write_pointer-- = char('0' + (value % 10)); + value /= 10; + }; + *write_pointer = char('0' + value); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; +} + + +} // anonymous namespace +namespace internal { + +/*** + * Minifier/formatter code. 
+ **/ + +template +simdjson_inline void base_formatter::number(uint64_t x) { + char number_buffer[24]; + char *newp = fast_itoa(number_buffer, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +template +simdjson_inline void base_formatter::number(int64_t x) { + char number_buffer[24]; + char *newp = fast_itoa(number_buffer, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +template +simdjson_inline void base_formatter::number(double x) { + char number_buffer[24]; + // Currently, passing the nullptr to the second argument is + // safe because our implementation does not check the second + // argument. + char *newp = internal::to_chars(number_buffer, nullptr, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +template +simdjson_inline void base_formatter::start_array() { one_char('['); } + + +template +simdjson_inline void base_formatter::end_array() { one_char(']'); } + +template +simdjson_inline void base_formatter::start_object() { one_char('{'); } + +template +simdjson_inline void base_formatter::end_object() { one_char('}'); } + +template +simdjson_inline void base_formatter::comma() { one_char(','); } + +template +simdjson_inline void base_formatter::true_atom() { + const char * s = "true"; + buffer.insert(buffer.end(), s, s + 4); +} + +template +simdjson_inline void base_formatter::false_atom() { + const char * s = "false"; + buffer.insert(buffer.end(), s, s + 5); +} + +template +simdjson_inline void base_formatter::null_atom() { + const char * s = "null"; + buffer.insert(buffer.end(), s, s + 4); +} + +template +simdjson_inline void base_formatter::one_char(char c) { buffer.push_back(c); } + +template +simdjson_inline void base_formatter::key(std::string_view unescaped) { + string(unescaped); + one_char(':'); +} + +template +simdjson_inline void base_formatter::string(std::string_view unescaped) { + one_char('\"'); + size_t i = 0; + // Fast path for the case where we have no control character, no ", and no backslash. 
+ // This should include most keys. + // + // We would like to use 'bool' but some compilers take offense to bitwise operation + // with bool types. + constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for(;i + 8 <= unescaped.length(); i += 8) { + // Poor's man vectorization. This could get much faster if we used SIMD. + // + // It is not the case that replacing '|' with '||' would be neutral performance-wise. + if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])] + | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])] + | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])] + | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])] + ) { break; } + } + for(;i < unescaped.length(); i++) { + if(needs_escaping[uint8_t(unescaped[i])]) { break; } + } + // The following is also possible and omits a 256-byte table, but it is slower: + // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) + // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} + + // At least for long strings, the following should be fast. 
We could + // do better by integrating the checks and the insertion. + buffer.insert(buffer.end(), unescaped.data(), unescaped.data() + i); + // We caught a control character if we enter this loop (slow). + // Note that we are do not restart from the beginning, but rather we continue + // from the point where we encountered something that requires escaping. + for (; i < unescaped.length(); i++) { + switch (unescaped[i]) { + case '\"': + { + const char * s = "\\\""; + buffer.insert(buffer.end(), s, s + 2); + } + break; + case '\\': + { + const char * s = "\\\\"; + buffer.insert(buffer.end(), s, s + 2); + } + break; + default: + if (uint8_t(unescaped[i]) <= 0x1F) { + // If packed, this uses 8 * 32 bytes. + // Note that we expect most compilers to embed this code in the data + // section. + constexpr static escape_sequence escaped[32] = { + {6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"}, + {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"}, + {2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"}, + {2, "\\f"}, {2, "\\r"}, {6, "\\u000e"}, {6, "\\u000f"}, + {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"}, + {6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"}, + {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"}, + {6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}}; + auto u = escaped[uint8_t(unescaped[i])]; + buffer.insert(buffer.end(), u.string, u.string + u.length); + } else { + one_char(unescaped[i]); + } + } // switch + } // for + one_char('\"'); +} + + +template +inline void base_formatter::clear() { + buffer.clear(); +} + +template +simdjson_inline std::string_view base_formatter::str() const { + return std::string_view(buffer.data(), buffer.size()); +} + +simdjson_inline void mini_formatter::print_newline() { + return; +} + +simdjson_inline void mini_formatter::print_indents(size_t depth) { + (void)depth; + return; +} + +simdjson_inline void mini_formatter::print_space() { + return; +} + 
+simdjson_inline void pretty_formatter::print_newline() { + one_char('\n'); +} + +simdjson_inline void pretty_formatter::print_indents(size_t depth) { + if(this->indent_step <= 0) { + return; + } + for(size_t i = 0; i < this->indent_step * depth; i++) { + one_char(' '); + } +} + +simdjson_inline void pretty_formatter::print_space() { + one_char(' '); +} + +/*** + * String building code. + **/ + +template +inline void string_builder::append(simdjson::dom::element value) { + // using tape_type = simdjson::internal::tape_type; + size_t depth = 0; + constexpr size_t MAX_DEPTH = 16; + bool is_object[MAX_DEPTH]; + is_object[0] = false; + bool after_value = false; + + internal::tape_ref iter(value.tape); + do { + // print commas after each value + if (after_value) { + format.comma(); + format.print_newline(); + } + + format.print_indents(depth); + + // If we are in an object, print the next key and :, and skip to the next + // value. + if (is_object[depth]) { + format.key(iter.get_string_view()); + format.print_space(); + iter.json_index++; + } + switch (iter.tape_ref_type()) { + + // Arrays + case tape_type::START_ARRAY: { + // If we're too deep, we need to recurse to go deeper. + depth++; + if (simdjson_unlikely(depth >= MAX_DEPTH)) { + append(simdjson::dom::array(iter)); + iter.json_index = iter.matching_brace_index() - 1; // Jump to the ] + depth--; + break; + } + + // Output start [ + format.start_array(); + iter.json_index++; + + // Handle empty [] (we don't want to come back around and print commas) + if (iter.tape_ref_type() == tape_type::END_ARRAY) { + format.end_array(); + depth--; + break; + } + + is_object[depth] = false; + after_value = false; + format.print_newline(); + continue; + } + + // Objects + case tape_type::START_OBJECT: { + // If we're too deep, we need to recurse to go deeper. 
+ depth++; + if (simdjson_unlikely(depth >= MAX_DEPTH)) { + append(simdjson::dom::object(iter)); + iter.json_index = iter.matching_brace_index() - 1; // Jump to the } + depth--; + break; + } + + // Output start { + format.start_object(); + iter.json_index++; + + // Handle empty {} (we don't want to come back around and print commas) + if (iter.tape_ref_type() == tape_type::END_OBJECT) { + format.end_object(); + depth--; + break; + } + + is_object[depth] = true; + after_value = false; + format.print_newline(); + continue; + } + + // Scalars + case tape_type::STRING: + format.string(iter.get_string_view()); + break; + case tape_type::INT64: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::UINT64: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::DOUBLE: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::TRUE_VALUE: + format.true_atom(); + break; + case tape_type::FALSE_VALUE: + format.false_atom(); + break; + case tape_type::NULL_VALUE: + format.null_atom(); + break; + + // These are impossible + case tape_type::END_ARRAY: + case tape_type::END_OBJECT: + case tape_type::ROOT: + SIMDJSON_UNREACHABLE(); + } + iter.json_index++; + after_value = true; + + // Handle multiple ends in a row + while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || + iter.tape_ref_type() == tape_type::END_OBJECT)) { + format.print_newline(); + depth--; + format.print_indents(depth); + if (iter.tape_ref_type() == tape_type::END_ARRAY) { + format.end_array(); + } else { + format.end_object(); + } + iter.json_index++; + } + + // Stop when we're at depth 0 + } while (depth != 0); + + format.print_newline(); +} + +template +inline void string_builder::append(simdjson::dom::object 
value) { + format.start_object(); + auto pair = value.begin(); + auto end = value.end(); + if (pair != end) { + append(*pair); + for (++pair; pair != end; ++pair) { + format.comma(); + append(*pair); + } + } + format.end_object(); +} + +template +inline void string_builder::append(simdjson::dom::array value) { + format.start_array(); + auto iter = value.begin(); + auto end = value.end(); + if (iter != end) { + append(*iter); + for (++iter; iter != end; ++iter) { + format.comma(); + append(*iter); + } + } + format.end_array(); +} + +template +simdjson_inline void string_builder::append(simdjson::dom::key_value_pair kv) { + format.key(kv.key); + append(kv.value); +} + +template +simdjson_inline void string_builder::clear() { + format.clear(); +} + +template +simdjson_inline std::string_view string_builder::str() const { + return format.str(); +} + + +} // namespace internal +} // namespace simdjson + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/dom/serialization.h b/contrib/libs/simdjson/include/simdjson/dom/serialization.h new file mode 100644 index 000000000000..87c735bbbd81 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/dom/serialization.h @@ -0,0 +1,260 @@ +#ifndef SIMDJSON_SERIALIZATION_H +#define SIMDJSON_SERIALIZATION_H + +#include "simdjson/dom/base.h" +#include "simdjson/dom/element.h" +#include "simdjson/dom/object.h" + +#include + +namespace simdjson { + +/** + * The string_builder template and mini_formatter class + * are not part of our public API and are subject to change + * at any time! 
+ */ +namespace internal { + +template +class base_formatter { +public: + /** Add a comma **/ + simdjson_inline void comma(); + /** Start an array, prints [ **/ + simdjson_inline void start_array(); + /** End an array, prints ] **/ + simdjson_inline void end_array(); + /** Start an array, prints { **/ + simdjson_inline void start_object(); + /** Start an array, prints } **/ + simdjson_inline void end_object(); + /** Prints a true **/ + simdjson_inline void true_atom(); + /** Prints a false **/ + simdjson_inline void false_atom(); + /** Prints a null **/ + simdjson_inline void null_atom(); + /** Prints a number **/ + simdjson_inline void number(int64_t x); + /** Prints a number **/ + simdjson_inline void number(uint64_t x); + /** Prints a number **/ + simdjson_inline void number(double x); + /** Prints a key (string + colon) **/ + simdjson_inline void key(std::string_view unescaped); + /** Prints a string. The string is escaped as needed. **/ + simdjson_inline void string(std::string_view unescaped); + /** Clears out the content. **/ + simdjson_inline void clear(); + /** + * Get access to the buffer, it is owned by the instance, but + * the user can make a copy. + **/ + simdjson_inline std::string_view str() const; + + /** Prints one character **/ + simdjson_inline void one_char(char c); + + simdjson_inline void call_print_newline() { + static_cast(this)->print_newline(); + } + + simdjson_inline void call_print_indents(size_t depth) { + static_cast(this)->print_indents(depth); + } + + simdjson_inline void call_print_space() { + static_cast(this)->print_space(); + } + +protected: + // implementation details (subject to change) + /** Backing buffer **/ + std::vector buffer{}; // not ideal! +}; + + +/** + * @private This is the class that we expect to use with the string_builder + * template. It tries to produce a compact version of the JSON element + * as quickly as possible. 
+ */ +class mini_formatter : public base_formatter { +public: + simdjson_inline void print_newline(); + + simdjson_inline void print_indents(size_t depth); + + simdjson_inline void print_space(); +}; + +class pretty_formatter : public base_formatter { +public: + simdjson_inline void print_newline(); + + simdjson_inline void print_indents(size_t depth); + + simdjson_inline void print_space(); + +protected: + int indent_step = 4; +}; + +/** + * @private The string_builder template allows us to construct + * a string from a document element. It is parametrized + * by a "formatter" which handles the details. Thus + * the string_builder template could support both minification + * and prettification, and various other tradeoffs. + */ +template +class string_builder { +public: + /** Construct an initially empty builder, would print the empty string **/ + string_builder() = default; + /** Append an element to the builder (to be printed) **/ + inline void append(simdjson::dom::element value); + /** Append an array to the builder (to be printed) **/ + inline void append(simdjson::dom::array value); + /** Append an object to the builder (to be printed) **/ + inline void append(simdjson::dom::object value); + /** Reset the builder (so that it would print the empty string) **/ + simdjson_inline void clear(); + /** + * Get access to the string. The string_view is owned by the builder + * and it is invalid to use it after the string_builder has been + * destroyed. + * However you can make a copy of the string_view on memory that you + * own. + */ + simdjson_inline std::string_view str() const; + /** Append a key_value_pair to the builder (to be printed) **/ + simdjson_inline void append(simdjson::dom::key_value_pair value); +private: + formatter format{}; +}; + +} // internal + +namespace dom { + +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The element. + * @throw if there is an error with the underlying output stream. 
simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::element value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::array value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The object. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::object value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +} // namespace dom + +/** + * Converts JSON to a string. + * + * dom::parser parser; + * element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded); + * cout << to_string(doc) << endl; // prints [1,2,3] + * + */ +template +std::string to_string(T x) { + // in C++, to_string is standard: http://www.cplusplus.com/reference/string/to_string/ + // Currently minify and to_string are identical but in the future, they may + // differ. + simdjson::internal::string_builder<> sb; + sb.append(x); + std::string_view answer = sb.str(); + return std::string(answer.data(), answer.size()); +} +#if SIMDJSON_EXCEPTIONS +template +std::string to_string(simdjson_result x) { + if (x.error()) { throw simdjson_error(x.error()); } + return to_string(x.value()); +} +#endif + +/** + * Minifies a JSON element or document, printing the smallest possible valid JSON. 
+ * + * dom::parser parser; + * element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded); + * cout << minify(doc) << endl; // prints [1,2,3] + * + */ +template +std::string minify(T x) { + return to_string(x); +} + +#if SIMDJSON_EXCEPTIONS +template +std::string minify(simdjson_result x) { + if (x.error()) { throw simdjson_error(x.error()); } + return to_string(x.value()); +} +#endif + +/** + * Prettifies a JSON element or document, printing the valid JSON with indentation. + * + * dom::parser parser; + * element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded); + * + * // Prints: + * // { + * // [ + * // 1, + * // 2, + * // 3 + * // ] + * // } + * cout << prettify(doc) << endl; + * + */ +template +std::string prettify(T x) { + simdjson::internal::string_builder sb; + sb.append(x); + std::string_view answer = sb.str(); + return std::string(answer.data(), answer.size()); +} + +#if SIMDJSON_EXCEPTIONS +template +std::string prettify(simdjson_result x) { + if (x.error()) { throw simdjson_error(x.error()); } + return to_string(x.value()); +} +#endif + +} // namespace simdjson + + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/error-inl.h b/contrib/libs/simdjson/include/simdjson/error-inl.h new file mode 100644 index 000000000000..5cf61d9c3076 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/error-inl.h @@ -0,0 +1,184 @@ +#ifndef SIMDJSON_ERROR_INL_H +#define SIMDJSON_ERROR_INL_H + +#include "simdjson/error.h" + +#include + +namespace simdjson { +namespace internal { + // We store the error code so we can validate the error message is associated with the right code + struct error_code_info { + error_code code; + const char* message; // do not use a fancy std::string where a simple C string will do (no alloc, no destructor) + }; + // These MUST match the codes in error_code. We check this constraint in basictests. 
+ extern SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[]; +} // namespace internal + + +inline const char *error_message(error_code error) noexcept { + // If you're using error_code, we're trusting you got it from the enum. + return internal::error_codes[int(error)].message; +} + +// deprecated function +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +inline const std::string error_message(int error) noexcept { + if (error < 0 || error >= error_code::NUM_ERROR_CODES) { + return internal::error_codes[UNEXPECTED_ERROR].message; + } + return internal::error_codes[error].message; +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept { + return out << error_message(error); +} + +namespace internal { + +// +// internal::simdjson_result_base inline implementation +// + +template +simdjson_inline void simdjson_result_base::tie(T &value, error_code &error) && noexcept { + error = this->second; + if (!error) { + value = std::forward>(*this).first; + } +} + +template +simdjson_warn_unused simdjson_inline error_code simdjson_result_base::get(T &value) && noexcept { + error_code error; + std::forward>(*this).tie(value, error); + return error; +} + +template +simdjson_inline error_code simdjson_result_base::error() const noexcept { + return this->second; +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& simdjson_result_base::value() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return this->first; +} + +template +simdjson_inline T&& simdjson_result_base::value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline T&& simdjson_result_base::take_value() && noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return std::forward(this->first); +} + +template +simdjson_inline simdjson_result_base::operator T&&() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // 
SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& simdjson_result_base::value_unsafe() const& noexcept { + return this->first; +} + +template +simdjson_inline T&& simdjson_result_base::value_unsafe() && noexcept { + return std::forward(this->first); +} + +template +simdjson_inline simdjson_result_base::simdjson_result_base(T &&value, error_code error) noexcept + : std::pair(std::forward(value), error) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base(error_code error) noexcept + : simdjson_result_base(T{}, error) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base(T &&value) noexcept + : simdjson_result_base(std::forward(value), SUCCESS) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base() noexcept + : simdjson_result_base(T{}, UNINITIALIZED) {} + +} // namespace internal + +/// +/// simdjson_result inline implementation +/// + +template +simdjson_inline void simdjson_result::tie(T &value, error_code &error) && noexcept { + std::forward>(*this).tie(value, error); +} + +template +simdjson_warn_unused simdjson_inline error_code simdjson_result::get(T &value) && noexcept { + return std::forward>(*this).get(value); +} + +template +simdjson_inline error_code simdjson_result::error() const noexcept { + return internal::simdjson_result_base::error(); +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& simdjson_result::value() & noexcept(false) { + return internal::simdjson_result_base::value(); +} + +template +simdjson_inline T&& simdjson_result::value() && noexcept(false) { + return std::forward>(*this).value(); +} + +template +simdjson_inline T&& simdjson_result::take_value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline simdjson_result::operator T&&() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& simdjson_result::value_unsafe() const& noexcept { 
+ return internal::simdjson_result_base::value_unsafe(); +} + +template +simdjson_inline T&& simdjson_result::value_unsafe() && noexcept { + return std::forward>(*this).value_unsafe(); +} + +template +simdjson_inline simdjson_result::simdjson_result(T &&value, error_code error) noexcept + : internal::simdjson_result_base(std::forward(value), error) {} +template +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} +template +simdjson_inline simdjson_result::simdjson_result(T &&value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +template +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} + +} // namespace simdjson + +#endif // SIMDJSON_ERROR_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/error.h b/contrib/libs/simdjson/include/simdjson/error.h new file mode 100644 index 000000000000..4848f6ef9127 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/error.h @@ -0,0 +1,318 @@ +#ifndef SIMDJSON_ERROR_H +#define SIMDJSON_ERROR_H + +#include "simdjson/base.h" + +#include +#include + +namespace simdjson { + +/** + * All possible errors returned by simdjson. These error codes are subject to change + * and not all simdjson kernel returns the same error code given the same input: it is not + * well defined which error a given input should produce. + * + * Only SUCCESS evaluates to false as a Boolean. All other error codes will evaluate + * to true as a Boolean. 
+ */ +enum error_code { + SUCCESS = 0, ///< No error + CAPACITY, ///< This parser can't support a document that big + MEMALLOC, ///< Error allocating memory, most likely out of memory + TAPE_ERROR, ///< Something went wrong, this is a generic error + DEPTH_ERROR, ///< Your document exceeds the user-specified depth limitation + STRING_ERROR, ///< Problem while parsing a string + T_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 't' + F_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'f' + N_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'n' + NUMBER_ERROR, ///< Problem while parsing a number + BIGINT_ERROR, ///< The integer value exceeds 64 bits + UTF8_ERROR, ///< the input is not valid UTF-8 + UNINITIALIZED, ///< unknown error, or uninitialized document + EMPTY, ///< no structural element found + UNESCAPED_CHARS, ///< found unescaped characters in a string. + UNCLOSED_STRING, ///< missing quote at the end + UNSUPPORTED_ARCHITECTURE, ///< unsupported architecture + INCORRECT_TYPE, ///< JSON element has a different type than user expected + NUMBER_OUT_OF_RANGE, ///< JSON number does not fit in 64 bits + INDEX_OUT_OF_BOUNDS, ///< JSON array index too large + NO_SUCH_FIELD, ///< JSON field not found in object + IO_ERROR, ///< Error reading a file + INVALID_JSON_POINTER, ///< Invalid JSON pointer syntax + INVALID_URI_FRAGMENT, ///< Invalid URI fragment + UNEXPECTED_ERROR, ///< indicative of a bug in simdjson + PARSER_IN_USE, ///< parser is already in use. + OUT_OF_ORDER_ITERATION, ///< tried to iterate an array or object out of order (checked when SIMDJSON_DEVELOPMENT_CHECKS=1) + INSUFFICIENT_PADDING, ///< The JSON doesn't have enough padding for simdjson to safely parse it. + INCOMPLETE_ARRAY_OR_OBJECT, ///< The document ends early. + SCALAR_DOCUMENT_AS_VALUE, ///< A scalar document is treated as a value. + OUT_OF_BOUNDS, ///< Attempted to access location outside of document. 
+ TRAILING_CONTENT, ///< Unexpected trailing content in the JSON input + NUM_ERROR_CODES +}; + +/** + * It is the convention throughout the code that the macro SIMDJSON_DEVELOPMENT_CHECKS determines whether + * we check for OUT_OF_ORDER_ITERATION. The logic behind it is that these errors only occurs when the code + * that was written while breaking some simdjson::ondemand requirement. They should not occur in released + * code after these issues were fixed. + */ + +/** + * Get the error message for the given error code. + * + * dom::parser parser; + * dom::element doc; + * auto error = parser.parse("foo",3).get(doc); + * if (error) { printf("Error: %s\n", error_message(error)); } + * + * @return The error message. + */ +inline const char *error_message(error_code error) noexcept; + +/** + * Write the error message to the output stream + */ +inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept; + +/** + * Exception thrown when an exception-supporting simdjson method is called + */ +struct simdjson_error : public std::exception { + /** + * Create an exception from a simdjson error code. + * @param error The error code + */ + simdjson_error(error_code error) noexcept : _error{error} { } + /** The error message */ + const char *what() const noexcept { return error_message(error()); } + /** The error code */ + error_code error() const noexcept { return _error; } +private: + /** The error code that was used */ + error_code _error; +}; + +namespace internal { + +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + * + * This is a base class for implementations that want to add functions to the result type for + * chaining. 
+ * + * Override like: + * + * struct simdjson_result : public internal::simdjson_result_base { + * simdjson_result() noexcept : internal::simdjson_result_base() {} + * simdjson_result(error_code error) noexcept : internal::simdjson_result_base(error) {} + * simdjson_result(T &&value) noexcept : internal::simdjson_result_base(std::forward(value)) {} + * simdjson_result(T &&value, error_code error) noexcept : internal::simdjson_result_base(value, error) {} + * // Your extra methods here + * } + * + * Then any method returning simdjson_result will be chainable with your methods. + */ +template +struct simdjson_result_base : protected std::pair { + + /** + * Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline simdjson_result_base() noexcept; + + /** + * Create a new error result. + */ + simdjson_inline simdjson_result_base(error_code error) noexcept; + + /** + * Create a new successful result. + */ + simdjson_inline simdjson_result_base(T &&value) noexcept; + + /** + * Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline simdjson_result_base(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. + * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. + */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. 
+ */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T&& value_unsafe() && noexcept; + +}; // struct simdjson_result_base + +} // namespace internal + +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + */ +template +struct simdjson_result : public internal::simdjson_result_base { + /** + * @private Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline simdjson_result() noexcept; + /** + * @private Create a new successful result. + */ + simdjson_inline simdjson_result(T &&value) noexcept; + /** + * @private Create a new error result. + */ + simdjson_inline simdjson_result(error_code error_code) noexcept; + /** + * @private Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline simdjson_result(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. 
+ * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_warn_unused simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. + */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. 
+ */ + simdjson_inline T&& value_unsafe() && noexcept; + +}; // struct simdjson_result + +#if SIMDJSON_EXCEPTIONS + +template +inline std::ostream& operator<<(std::ostream& out, simdjson_result value) { return out << value.value(); } +#endif // SIMDJSON_EXCEPTIONS + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +/** + * @deprecated This is an alias and will be removed, use error_code instead + */ +using ErrorValues [[deprecated("This is an alias and will be removed, use error_code instead")]] = error_code; + +/** + * @deprecated Error codes should be stored and returned as `error_code`, use `error_message()` instead. + */ +[[deprecated("Error codes should be stored and returned as `error_code`, use `error_message()` instead.")]] +inline const std::string error_message(int error) noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API +} // namespace simdjson + +#endif // SIMDJSON_ERROR_H diff --git a/contrib/libs/simdjson/include/simdjson/fallback.h b/contrib/libs/simdjson/include/simdjson/fallback.h new file mode 100644 index 000000000000..4588cdc00f07 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_FALLBACK_H +#define SIMDJSON_FALLBACK_H + +#include "simdjson/fallback/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/fallback/end.h" + +#endif // SIMDJSON_FALLBACK_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/fallback/base.h b/contrib/libs/simdjson/include/simdjson/fallback/base.h new file mode 100644 index 000000000000..99cb37423d76 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/base.h @@ -0,0 +1,19 @@ +#ifndef SIMDJSON_FALLBACK_BASE_H +#define SIMDJSON_FALLBACK_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Fallback implementation (runs on any machine). 
+ */ +namespace fallback { + +class implementation; + +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/fallback/begin.h b/contrib/libs/simdjson/include/simdjson/fallback/begin.h new file mode 100644 index 000000000000..74749f6d456e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/begin.h @@ -0,0 +1,5 @@ +#define SIMDJSON_IMPLEMENTATION fallback +#include "simdjson/fallback/base.h" +#include "simdjson/fallback/bitmanipulation.h" +#include "simdjson/fallback/stringparsing_defs.h" +#include "simdjson/fallback/numberparsing_defs.h" diff --git a/contrib/libs/simdjson/include/simdjson/fallback/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/fallback/bitmanipulation.h new file mode 100644 index 000000000000..ba47dcccfbf9 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/bitmanipulation.h @@ -0,0 +1,48 @@ +#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H +#define SIMDJSON_FALLBACK_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/fallback/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace fallback { +namespace { + +#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) +static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { + unsigned long x0 = (unsigned long)x, top, bottom; + _BitScanForward(&top, (unsigned long)(x >> 32)); + _BitScanForward(&bottom, x0); + *ret = x0 ? bottom : 32 + top; + return x != 0; +} +static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { + unsigned long x1 = (unsigned long)(x >> 32), top, bottom; + _BitScanReverse(&top, x1); + _BitScanReverse(&bottom, (unsigned long)x); + *ret = x1 ? 
top + 32 : bottom; + return x != 0; +} +#endif + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef _MSC_VER + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// _MSC_VER +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/fallback/end.h b/contrib/libs/simdjson/include/simdjson/fallback/end.h new file mode 100644 index 000000000000..fbd14132b132 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/end.h @@ -0,0 +1,5 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/fallback/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/fallback/implementation.h b/contrib/libs/simdjson/include/simdjson/fallback/implementation.h new file mode 100644 index 000000000000..523f06d2e920 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/implementation.h @@ -0,0 +1,34 @@ +#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H +#define SIMDJSON_FALLBACK_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/fallback/base.h" +#include "simdjson/implementation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace fallback { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "fallback", + "Generic fallback implementation", + 0 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const 
noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/fallback/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/fallback/numberparsing_defs.h new file mode 100644 index 000000000000..e807423e6f87 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/numberparsing_defs.h @@ -0,0 +1,80 @@ +#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H +#define SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/fallback/base.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); +#endif + +namespace simdjson { +namespace fallback { +namespace numberparsing { + +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const char *chars) { + uint64_t val; + memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + return parse_eight_digits_unrolled(reinterpret_cast(chars)); +} + +#if SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t 
__emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +/** @private */ +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace fallback +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/fallback/ondemand.h b/contrib/libs/simdjson/include/simdjson/fallback/ondemand.h new file mode 100644 index 000000000000..513b7483fd63 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_FALLBACK_ONDEMAND_H +#define SIMDJSON_FALLBACK_ONDEMAND_H + +#include "simdjson/fallback/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/fallback/end.h" + +#endif // SIMDJSON_FALLBACK_ONDEMAND_H diff --git 
a/contrib/libs/simdjson/include/simdjson/fallback/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/fallback/stringparsing_defs.h new file mode 100644 index 000000000000..64f23c4b03d1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/fallback/stringparsing_defs.h @@ -0,0 +1,36 @@ +#ifndef SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H +#define SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/fallback/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace fallback { +namespace { + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 1; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return c == '"'; } + simdjson_inline bool has_backslash() { return c == '\\'; } + simdjson_inline int quote_index() { return c == '"' ? 0 : 1; } + simdjson_inline int backslash_index() { return c == '\\' ? 0 : 1; } + + uint8_t c; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // store to dest unconditionally - we can overwrite the bits we don't like later + dst[0] = src[0]; + return { src[0] }; +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/amalgamated.h b/contrib/libs/simdjson/include/simdjson/generic/amalgamated.h new file mode 100644 index 000000000000..4b235fa9f7b1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/amalgamated.h @@ -0,0 +1,12 @@ +#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H) +#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h! 
+#endif + +#include "simdjson/generic/base.h" +#include "simdjson/generic/jsoncharutils.h" +#include "simdjson/generic/atomparsing.h" +#include "simdjson/generic/dom_parser_implementation.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/numberparsing.h" + +#include "simdjson/generic/implementation_simdjson_result_base-inl.h" diff --git a/contrib/libs/simdjson/include/simdjson/generic/atomparsing.h b/contrib/libs/simdjson/include/simdjson/generic/atomparsing.h new file mode 100644 index 000000000000..2eeb7b81a263 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/atomparsing.h @@ -0,0 +1,77 @@ +#ifndef SIMDJSON_GENERIC_ATOMPARSING_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ATOMPARSING_H +#include "simdjson/generic/base.h" +#include "simdjson/generic/jsoncharutils.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. 
+simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ATOMPARSING_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/base.h b/contrib/libs/simdjson/include/simdjson/generic/base.h new file mode 100644 index 
000000000000..65180837229f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/base.h @@ -0,0 +1,51 @@ +#ifndef SIMDJSON_GENERIC_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_BASE_H +#include "simdjson/base.h" +// If we haven't got an implementation yet, we're in the editor, editing a generic file! Just +// use the most advanced one we can so the most possible stuff can be tested. +#ifndef SIMDJSON_IMPLEMENTATION +#include "simdjson/implementation_detection.h" +#if SIMDJSON_IMPLEMENTATION_ICELAKE +#include "simdjson/icelake/begin.h" +#elif SIMDJSON_IMPLEMENTATION_HASWELL +#include "simdjson/haswell/begin.h" +#elif SIMDJSON_IMPLEMENTATION_WESTMERE +#include "simdjson/westmere/begin.h" +#elif SIMDJSON_IMPLEMENTATION_ARM64 +#include "simdjson/arm64/begin.h" +#elif SIMDJSON_IMPLEMENTATION_PPC64 +#error #include "simdjson/ppc64/begin.h" +#elif SIMDJSON_IMPLEMENTATION_LSX +#include "simdjson/lsx/begin.h" +#elif SIMDJSON_IMPLEMENTATION_LASX +#include "simdjson/lasx/begin.h" +#elif SIMDJSON_IMPLEMENTATION_FALLBACK +#include "simdjson/fallback/begin.h" +#else +#error "All possible implementations (including fallback) have been disabled! simdjson will not run." 
+#endif +#endif // SIMDJSON_IMPLEMENTATION +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +struct open_container; +class dom_parser_implementation; + +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer, /// a positive integer larger or equal to 1<<63 + big_integer /// a big integer that does not fit in a 64-bit word +}; + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/dependencies.h b/contrib/libs/simdjson/include/simdjson/generic/dependencies.h new file mode 100644 index 000000000000..28d9749f2766 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/dependencies.h @@ -0,0 +1,20 @@ +#ifdef SIMDJSON_CONDITIONAL_INCLUDE +#error simdjson/generic/dependencies.h must be included before defining SIMDJSON_CONDITIONAL_INCLUDE! +#endif + +#ifndef SIMDJSON_GENERIC_DEPENDENCIES_H +#define SIMDJSON_GENERIC_DEPENDENCIES_H + +// Internal headers needed for generics. +// All includes referencing simdjson headers *not* under simdjson/generic must be here! +// Otherwise, amalgamation will fail. 
+#include "simdjson/base.h" +#include "simdjson/implementation.h" +#include "simdjson/implementation_detection.h" +#include "simdjson/internal/instruction_set.h" +#include "simdjson/internal/dom_parser_implementation.h" +#include "simdjson/internal/jsoncharutils_tables.h" +#include "simdjson/internal/numberparsing_tables.h" +#include "simdjson/internal/simdprune_tables.h" + +#endif // SIMDJSON_GENERIC_DEPENDENCIES_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/dom_parser_implementation.h b/contrib/libs/simdjson/include/simdjson/generic/dom_parser_implementation.h new file mode 100644 index 000000000000..e51d2c1279b4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/dom_parser_implementation.h @@ -0,0 +1,89 @@ +#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H +#include "simdjson/generic/base.h" +#include "simdjson/internal/dom_parser_implementation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +// expectation: sizeof(open_container) = 64/8. 
+struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final; + simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace SIMDJSON_IMPLEMENTATION 
+} // namespace simdjson + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base-inl.h b/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base-inl.h new file mode 100644 index 000000000000..4990ad7ea150 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base-inl.h @@ -0,0 +1,90 @@ +#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE 
+#define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H +#include "simdjson/generic/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +// +// internal::implementation_simdjson_result_base inline implementation +// + +template +simdjson_inline void implementation_simdjson_result_base::tie(T &value, error_code &error) && noexcept { + error = this->second; + if (!error) { + value = std::forward>(*this).first; + } +} + +template +simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base::get(T &value) && noexcept { + error_code error; + std::forward>(*this).tie(value, error); + return error; +} + +template +simdjson_inline error_code implementation_simdjson_result_base::error() const noexcept { + return this->second; +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& implementation_simdjson_result_base::value() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return this->first; +} + +template +simdjson_inline T&& implementation_simdjson_result_base::value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline T&& implementation_simdjson_result_base::take_value() && noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return std::forward(this->first); +} + +template +simdjson_inline implementation_simdjson_result_base::operator T&&() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& implementation_simdjson_result_base::value_unsafe() const& noexcept { + return this->first; +} + +template +simdjson_inline T& implementation_simdjson_result_base::value_unsafe() & noexcept { + return this->first; +} + +template +simdjson_inline T&& implementation_simdjson_result_base::value_unsafe() && noexcept { + return 
std::forward(this->first); +} + +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(T &&value, error_code error) noexcept + : first{std::forward(value)}, second{error} {} +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(error_code error) noexcept + : implementation_simdjson_result_base(T{}, error) {} +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(T &&value) noexcept + : implementation_simdjson_result_base(std::forward(value), SUCCESS) {} + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base.h b/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base.h new file mode 100644 index 000000000000..aaf2bce1267c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/implementation_simdjson_result_base.h @@ -0,0 +1,134 @@ +#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H +#include "simdjson/generic/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair +// so we can avoid inlining errors +// TODO reconcile these! +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + * + * This is a base class for implementations that want to add functions to the result type for + * chaining. 
+ * + * Override like: + * + * struct simdjson_result : public internal::implementation_simdjson_result_base { + * simdjson_result() noexcept : internal::implementation_simdjson_result_base() {} + * simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base(error) {} + * simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base(std::forward(value)) {} + * simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base(value, error) {} + * // Your extra methods here + * } + * + * Then any method returning simdjson_result will be chainable with your methods. + */ +template +struct implementation_simdjson_result_base { + + /** + * Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline implementation_simdjson_result_base() noexcept = default; + + /** + * Create a new error result. + */ + simdjson_inline implementation_simdjson_result_base(error_code error) noexcept; + + /** + * Create a new successful result. + */ + simdjson_inline implementation_simdjson_result_base(T &&value) noexcept; + + /** + * Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. + * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. 
+ */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); + + +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T& value_unsafe() & noexcept; + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T&& value_unsafe() && noexcept; +protected: + /** users should never directly access first and second. **/ + T first{}; /** Users should never directly access 'first'. **/ + error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. 
**/ +}; // struct implementation_simdjson_result_base + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/jsoncharutils.h b/contrib/libs/simdjson/include/simdjson/generic/jsoncharutils.h new file mode 100644 index 000000000000..a79b1f8507d4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/jsoncharutils.h @@ -0,0 +1,104 @@ +#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_JSONCHARUTILS_H +#include "simdjson/generic/base.h" +#include "simdjson/internal/jsoncharutils_tables.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if 
the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. +// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. 
+ return 0; // bad r +} + +#if SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/numberparsing.h b/contrib/libs/simdjson/include/simdjson/generic/numberparsing.h new file mode 100644 index 000000000000..facc2acc6f91 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/numberparsing.h @@ -0,0 +1,1310 @@ +#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_NUMBERPARSING_H +#include "simdjson/generic/base.h" +#include "simdjson/generic/jsoncharutils.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace numberparsing { + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), 
(SRC)), (WRITER).append_double((VALUE))) +#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#define BIGINT_NUMBER(SRC) (BIGINT_ERROR) +#endif + +namespace { + +// Convert a mantissa, an exponent and a sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} + +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. +simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. 
+ if (0 <= power && power <= 22 && i <= 9007199254740991) +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) +#endif + { + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. 
+ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) 
+ // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". + // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. + // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. + // + // That's for when q>=0. 
The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. 
+ mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. + // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. + // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. 
+ if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // 
might overflow, we will handle the overflow later + return true; +} + +simdjson_inline bool is_digit(const uint8_t c) { + return static_cast(c - '0') <= 9; +} + +simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. + // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. 
+ + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? 
-exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) { + const uint8_t *const srcend = src + max_length; + bool negative = (*src == '-'); // we can always read at least one character after the '-' + const uint8_t *p = src + uint8_t(negative); + if(p == srcend) { return false; } + if(*p == '0') { + ++p; + if(p == srcend) { return true; } + if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; } + return true; + } + while(p != srcend && is_digit(*p)) { ++p; } + if(p == srcend) { return true; } + if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; } + return true; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +} // unnamed namespace + +/** @private */ +static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) { + if (parse_float_fallback(src, answer)) { + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +/** @private */ +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! 
+ if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer + // reference to it, it would force it to be stored in memory, preventing the compiler from + // picking it apart and putting into registers. i.e. if we pass it as reference, + // it gets slow. + double d; + error_code error = slow_float_parsing(src, &d); + writer.append_double(d); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. 
+ return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. + if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. +template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer); + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; 
} +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. +template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' 
== *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. 
+ // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it does not fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. +// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. +// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. 
+ if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. 
+ // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. 
+ if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. 
+ if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. 
+ // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = p-start_digits > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + size_t digit_count = size_t(p - src); + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + static const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + // We have an integer. + if(simdjson_unlikely(digit_count > 20)) { + return number_type::big_integer; + } + // If the number is negative and valid, it must be a signed integer. + if(negative) { + if (simdjson_unlikely(digit_count > 19)) return number_type::big_integer; + if (simdjson_unlikely(digit_count == 19 && memcmp(src, smaller_big_integer, 19) > 0)) { + return number_type::big_integer; + } + return number_type::signed_integer; + } + // Let us check if we have a big integer (>=2**64). 
+ static const uint8_t * two_to_sixtyfour = reinterpret_cast("18446744073709551616"); + if((digit_count > 20) || (digit_count == 20 && memcmp(src, two_to_sixtyfour, 20) >= 0)) { + return number_type::big_integer; + } + // The number is positive and smaller than 18446744073709551616 (or 2**64). + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + if((digit_count == 20) || (digit_count >= 19 && memcmp(src, smaller_big_integer, 19) >= 0)) { + return number_type::unsigned_integer; + } + return number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. + return number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. 
+ // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = p-start_digits > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 
0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +} // unnamed namespace +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing + +inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept { + switch (type) { + case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break; + case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break; + case number_type::floating_point_number: out << "floating-point number (binary64)"; break; + case number_type::big_integer: out << "big integer"; break; + default: SIMDJSON_UNREACHABLE(); + } + return out; +} + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_NUMBERPARSING_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/amalgamated.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/amalgamated.h new file mode 100644 index 000000000000..d53e7316df71 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/amalgamated.h @@ -0,0 +1,42 @@ +#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_ONDEMAND_DEPENDENCIES_H) +#error simdjson/generic/ondemand/dependencies.h must be included before simdjson/generic/ondemand/amalgamated.h! 
+#endif + +// Stuff other things depend on +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#include "simdjson/generic/ondemand/value.h" +#include "simdjson/generic/ondemand/logger.h" +#include "simdjson/generic/ondemand/token_iterator.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/json_type.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/parser.h" + +// All other declarations +#include "simdjson/generic/ondemand/array.h" +#include "simdjson/generic/ondemand/array_iterator.h" +#include "simdjson/generic/ondemand/document.h" +#include "simdjson/generic/ondemand/document_stream.h" +#include "simdjson/generic/ondemand/field.h" +#include "simdjson/generic/ondemand/object.h" +#include "simdjson/generic/ondemand/object_iterator.h" +#include "simdjson/generic/ondemand/serialization.h" + +// Inline definitions +#include "simdjson/generic/ondemand/array-inl.h" +#include "simdjson/generic/ondemand/array_iterator-inl.h" +#include "simdjson/generic/ondemand/document-inl.h" +#include "simdjson/generic/ondemand/document_stream-inl.h" +#include "simdjson/generic/ondemand/field-inl.h" +#include "simdjson/generic/ondemand/json_iterator-inl.h" +#include "simdjson/generic/ondemand/json_type-inl.h" +#include "simdjson/generic/ondemand/logger-inl.h" +#include "simdjson/generic/ondemand/object-inl.h" +#include "simdjson/generic/ondemand/object_iterator-inl.h" +#include "simdjson/generic/ondemand/parser-inl.h" +#include "simdjson/generic/ondemand/raw_json_string-inl.h" +#include "simdjson/generic/ondemand/serialization-inl.h" +#include "simdjson/generic/ondemand/token_iterator-inl.h" +#include "simdjson/generic/ondemand/value-inl.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/array-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array-inl.h new 
file mode 100644 index 000000000000..b699ea4a0379 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array-inl.h @@ -0,0 +1,283 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_ARRAY_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_ARRAY_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/array.h" +#include "simdjson/generic/ondemand/array_iterator-inl.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/value.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +// +// ### Live States +// +// While iterating or looking up values, depth >= iter->depth. at_start may vary. Error is +// always SUCCESS: +// +// - Start: This is the state when the array is first found and the iterator is just past the `{`. +// In this state, at_start == true. +// - Next: After we hand a scalar value to the user, or an array/object which they then fully +// iterate over, the iterator is at the `,` before the next value (or `]`). In this state, +// depth == iter->depth, at_start == false, and error == SUCCESS. +// - Unfinished Business: When we hand an array/object to the user which they do not fully +// iterate over, we need to finish that iteration by skipping child values until we reach the +// Next state. In this state, depth > iter->depth, at_start == false, and error == SUCCESS. +// +// ## Error States +// +// In error states, we will yield exactly one more value before stopping. iter->depth == depth +// and at_start is always false. We decrement after yielding the error, moving to the Finished +// state. 
+// +// - Chained Error: When the array iterator is part of an error chain--for example, in +// `for (auto tweet : doc["tweets"])`, where the tweet element may be missing or not be an +// array--we yield that error in the loop, exactly once. In this state, error != SUCCESS and +// iter->depth == depth, and at_start == false. We decrement depth when we yield the error. +// - Missing Comma Error: When the iterator ++ method discovers there is no comma between elements, +// we flag that as an error and treat it exactly the same as a Chained Error. In this state, +// error == TAPE_ERROR, iter->depth == depth, and at_start == false. +// +// ## Terminal State +// +// The terminal state has iter->depth < depth. at_start is always false. +// +// - Finished: When we have reached a `]` or have reported an error, we are finished. We signal this +// by decrementing depth. In this state, iter->depth < depth, at_start == false, and +// error == SUCCESS. +// + +simdjson_inline array::array(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} + +simdjson_inline simdjson_result array::start(value_iterator &iter) noexcept { + // We don't need to know if the array is empty to start iteration, but we do want to know if there + // is an error--thus `simdjson_unused`. 
+ simdjson_unused bool has_value; + SIMDJSON_TRY( iter.start_array().get(has_value) ); + return array(iter); +} +simdjson_inline simdjson_result array::start_root(value_iterator &iter) noexcept { + simdjson_unused bool has_value; + SIMDJSON_TRY( iter.start_root_array().get(has_value) ); + return array(iter); +} +simdjson_inline simdjson_result array::started(value_iterator &iter) noexcept { + bool has_value; + SIMDJSON_TRY(iter.started_array().get(has_value)); + return array(iter); +} + +simdjson_inline simdjson_result array::begin() noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!iter.is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + return array_iterator(iter); +} +simdjson_inline simdjson_result array::end() noexcept { + return array_iterator(iter); +} +simdjson_inline error_code array::consume() noexcept { + auto error = iter.json_iter().skip_child(iter.depth()-1); + if(error) { iter.abandon(); } + return error; +} + +simdjson_inline simdjson_result array::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. + const uint8_t * final_point{iter._json_iter->unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline simdjson_result array::count_elements() & noexcept { + size_t count{0}; + // Important: we do not consume any of the values. + for(simdjson_unused auto v : *this) { count++; } + // The above loop will always succeed, but we want to report errors. 
+ if(iter.error()) { return iter.error(); } + // We need to move back at the start because we expect users to iterate through + // the array after counting the number of elements. + iter.reset_array(); + return count; +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline simdjson_result array::is_empty() & noexcept { + bool is_not_empty; + auto error = iter.reset_array().get(is_not_empty); + if(error) { return error; } + return !is_not_empty; +} + +inline simdjson_result array::reset() & noexcept { + return iter.reset_array(); +} + +inline simdjson_result array::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + // - means "the append position" or "the element after the end of the array" + // We don't support this, because we're returning a real element, not a position. + if (json_pointer == "-") { return INDEX_OUT_OF_BOUNDS; } + + // Read the array index + size_t array_index = 0; + size_t i; + for (i = 0; i < json_pointer.length() && json_pointer[i] != '/'; i++) { + uint8_t digit = uint8_t(json_pointer[i] - '0'); + // Check for non-digit in array index. If it's there, we're trying to get a field in an object + if (digit > 9) { return INCORRECT_TYPE; } + array_index = array_index*10 + digit; + } + + // 0 followed by other digits is invalid + if (i > 1 && json_pointer[0] == '0') { return INVALID_JSON_POINTER; } // "JSON pointer array index has other characters after 0" + + // Empty string is invalid; so is a "/" with no digits before it + if (i == 0) { return INVALID_JSON_POINTER; } // "Empty string in JSON pointer array index" + // Get the child + auto child = at(array_index); + // If there is an error, it ends here + if(child.error()) { + return child; + } + + // If there is a /, we're not done yet, call recursively. 
+ if (i < json_pointer.length()) { + child = child.at_pointer(json_pointer.substr(i)); + } + return child; +} + +inline std::string json_path_to_pointer_conversion(std::string_view json_path) { + if (json_path.empty() || (json_path.front() != '.' && + json_path.front() != '[')) { + return "-1"; // This is just a sentinel value, the caller should check for this and return an error. + } + + std::string result; + // Reserve space to reduce allocations, adjusting for potential increases due + // to escaping. + result.reserve(json_path.size() * 2); + + size_t i = 0; + + while (i < json_path.length()) { + if (json_path[i] == '.') { + result += '/'; + } else if (json_path[i] == '[') { + result += '/'; + ++i; // Move past the '[' + while (i < json_path.length() && json_path[i] != ']') { + if (json_path[i] == '~') { + result += "~0"; + } else if (json_path[i] == '/') { + result += "~1"; + } else { + result += json_path[i]; + } + ++i; + } + if (i == json_path.length() || json_path[i] != ']') { + return "-1"; // Using sentinel value that will be handled as an error by the caller. 
+ } + } else { + if (json_path[i] == '~') { + result += "~0"; + } else if (json_path[i] == '/') { + result += "~1"; + } else { + result += json_path[i]; + } + } + ++i; + } + + return result; +} + +inline simdjson_result array::at_path(std::string_view json_path) noexcept { + auto json_pointer = json_path_to_pointer_conversion(json_path); + if (json_pointer == "-1") { return INVALID_JSON_POINTER; } + return at_pointer(json_pointer); +} + +simdjson_inline simdjson_result array::at(size_t index) noexcept { + size_t i = 0; + for (auto value : *this) { + if (i == index) { return value; } + i++; + } + return INDEX_OUT_OF_BOUNDS; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::array &&value +) noexcept + : implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept + : implementation_simdjson_result_base(error) +{ +} + +simdjson_inline simdjson_result simdjson_result::begin() noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() noexcept { + if (error()) { return error(); } + return first.end(); +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::is_empty() & noexcept { + if (error()) { return error(); } + return first.is_empty(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +simdjson_inline simdjson_result 
simdjson_result::at_path(std::string_view json_path) noexcept { + if (error()) { return error(); } + return first.at_path(json_path); +} +simdjson_inline simdjson_result simdjson_result::raw_json() noexcept { + if (error()) { return error(); } + return first.raw_json(); +} +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_ARRAY_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/array.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array.h new file mode 100644 index 000000000000..e6095d27c240 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array.h @@ -0,0 +1,216 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_ARRAY_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_ARRAY_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A forward-only JSON array. + */ +class array { +public: + /** + * Create a new invalid array. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline array() noexcept = default; + + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result begin() noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result end() noexcept; + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. 
After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an array is empty, it is more performant to use + * the is_empty() method. + */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the beginning of the array and checks whether the + * array is empty. + * The runtime complexity is constant time. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + simdjson_inline simdjson_result is_empty() & noexcept; + /** + * Reset the iterator so that we are pointing back at the + * beginning of the array. You should still consume values only once even if you + * can iterate through the array more than once. If you unescape a string + * within the array more than once, you have unsafe code. Note that rewinding + * an array means that you may need to reparse it anew: it is not a free + * operation. + * + * @returns true if the array contains some elements (not empty) + */ + inline simdjson_result reset() & noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"([ { "foo": { "a": [ 10, 20, 30 ] }} ])"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/0/foo/a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. 
It invalidates all previously accessed arrays, objects and values + * that have not been consumed. Yet it is not the case when calling at_pointer on an array + * instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + */ + inline simdjson_result at_path(std::string_view json_path) noexcept; + + /** + * Consumes the array and returns a string_view instance corresponding to the + * array as represented in JSON. It points inside the original document. 
+ */ + simdjson_inline simdjson_result raw_json() noexcept; + + /** + * Get the value at the given index. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) noexcept; +protected: + /** + * Go to the end of the array, no matter where you are right now. + */ + simdjson_inline error_code consume() noexcept; + + /** + * Begin array iteration. + * + * @param iter The iterator. Must be where the initial [ is expected. Will be *moved* into the + * resulting array. + * @error INCORRECT_TYPE if the iterator is not at [. + */ + static simdjson_inline simdjson_result start(value_iterator &iter) noexcept; + /** + * Begin array iteration from the root. + * + * @param iter The iterator. Must be where the initial [ is expected. Will be *moved* into the + * resulting array. + * @error INCORRECT_TYPE if the iterator is not at [. + * @error TAPE_ERROR if there is no closing ] at the end of the document. + */ + static simdjson_inline simdjson_result start_root(value_iterator &iter) noexcept; + /** + * Begin array iteration. + * + * This version of the method should be called after the initial [ has been verified, and is + * intended for use by switch statements that check the type of a value. + * + * @param iter The iterator. Must be after the initial [. Will be *moved* into the resulting array. + */ + static simdjson_inline simdjson_result started(value_iterator &iter) noexcept; + + /** + * Create an array at the given Internal array creation. Call array::start() or array::started() instead of this. + * + * @param iter The iterator. Must either be at the start of the first element with iter.is_alive() + * == true, or past the [] with is_alive() == false if the array is empty. 
Will be *moved* + * into the resulting array. + */ + simdjson_inline array(const value_iterator &iter) noexcept; + + /** + * Iterator marking current position. + * + * iter.is_alive() == false indicates iteration is complete. + */ + value_iterator iter{}; + + friend class value; + friend class document; + friend struct simdjson_result; + friend struct simdjson_result; + friend class array_iterator; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::array &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + inline simdjson_result count_elements() & noexcept; + inline simdjson_result is_empty() & noexcept; + inline simdjson_result reset() & noexcept; + simdjson_inline simdjson_result at(size_t index) noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; + simdjson_inline simdjson_result raw_json() noexcept; + +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_ARRAY_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator-inl.h new file mode 100644 index 000000000000..6e4ba8140dea --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator-inl.h @@ -0,0 +1,78 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_INL_H +#include 
"simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/array_iterator.h" +#include "simdjson/generic/ondemand/value-inl.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline array_iterator::array_iterator(const value_iterator &_iter) noexcept + : iter{_iter} +{} + +simdjson_inline simdjson_result array_iterator::operator*() noexcept { + if (iter.error()) { iter.abandon(); return iter.error(); } + return value(iter.child()); +} +simdjson_inline bool array_iterator::operator==(const array_iterator &other) const noexcept { + return !(*this != other); +} +simdjson_inline bool array_iterator::operator!=(const array_iterator &) const noexcept { + return iter.is_open(); +} +simdjson_inline array_iterator &array_iterator::operator++() noexcept { + error_code error; + // PERF NOTE this is a safety rail ... users should exit loops as soon as they receive an error, so we'll never get here. + // However, it does not seem to make a perf difference, so we add it out of an abundance of caution. 
+ if (( error = iter.error() )) { return *this; } + if (( error = iter.skip_child() )) { return *this; } + if (( error = iter.has_next_element().error() )) { return *this; } + return *this; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::array_iterator &&value +) noexcept + : SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base(std::forward(value)) +{ + first.iter.assert_is_valid(); +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base({}, error) +{ +} + +simdjson_inline simdjson_result simdjson_result::operator*() noexcept { + if (error()) { return error(); } + return *first; +} +simdjson_inline bool simdjson_result::operator==(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return !error(); } + return first == other.first; +} +simdjson_inline bool simdjson_result::operator!=(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return error(); } + return first != other.first; +} +simdjson_inline simdjson_result &simdjson_result::operator++() noexcept { + // Clear the error if there is one, so we don't yield it twice + if (error()) { second = SUCCESS; return *this; } + ++(first); + return *this; +} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator.h new file mode 100644 index 000000000000..0957be9c79cd --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/array_iterator.h @@ -0,0 +1,96 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define 
SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_H +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A forward-only JSON array. + * + * This is an input_iterator, meaning: + * - It is forward-only + * - * must be called exactly once per element. + * - ++ must be called exactly once in between each * (*, ++, *, ++, * ...) + */ +class array_iterator { +public: + /** Create a new, invalid array iterator. */ + simdjson_inline array_iterator() noexcept = default; + + // + // Iterator interface + // + + /** + * Get the current element. + * + * Part of the std::iterator interface. + */ + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + /** + * Check if we are at the end of the JSON. + * + * Part of the std::iterator interface. + * + * @return true if there are no more elements in the JSON array. + */ + simdjson_inline bool operator==(const array_iterator &) const noexcept; + /** + * Check if there are more elements in the JSON array. + * + * Part of the std::iterator interface. + * + * @return true if there are more elements in the JSON array. + */ + simdjson_inline bool operator!=(const array_iterator &) const noexcept; + /** + * Move to the next element. + * + * Part of the std::iterator interface. 
+ */ + simdjson_inline array_iterator &operator++() noexcept; + +private: + value_iterator iter{}; + + simdjson_inline array_iterator(const value_iterator &iter) noexcept; + + friend class array; + friend class value; + friend struct simdjson_result; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::array_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + // + // Iterator interface + // + + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + simdjson_inline bool operator==(const simdjson_result &) const noexcept; + simdjson_inline bool operator!=(const simdjson_result &) const noexcept; + simdjson_inline simdjson_result &operator++() noexcept; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_ARRAY_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/base.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/base.h new file mode 100644 index 000000000000..89bd0c01bba0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/base.h @@ -0,0 +1,47 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_BASE_H +#include "simdjson/generic/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +/** + * A fast, simple, DOM-like interface that parses JSON as you use it. + * + * Designed for maximum speed and a lower memory profile. + */ +namespace ondemand { + +/** Represents the depth of a JSON value (number of nested arrays/objects). 
*/ +using depth_t = int32_t; + +/** @copydoc simdjson::SIMDJSON_IMPLEMENTATION::number_type */ +using number_type = simdjson::SIMDJSON_IMPLEMENTATION::number_type; + +/** @private Position in the JSON buffer indexes */ +using token_position = const uint32_t *; + +class array; +class array_iterator; +class document; +class document_reference; +class document_stream; +class field; +class json_iterator; +enum class json_type; +struct number; +class object; +class object_iterator; +class parser; +class raw_json_string; +class token_iterator; +class value; +class value_iterator; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/dependencies.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/dependencies.h new file mode 100644 index 000000000000..7fd9da72ef26 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/dependencies.h @@ -0,0 +1,17 @@ +#ifdef SIMDJSON_CONDITIONAL_INCLUDE +#error simdjson/generic/ondemand/dependencies.h must be included before defining SIMDJSON_CONDITIONAL_INCLUDE! +#endif + +#ifndef SIMDJSON_GENERIC_ONDEMAND_DEPENDENCIES_H +#define SIMDJSON_GENERIC_ONDEMAND_DEPENDENCIES_H + +// Internal headers needed for ondemand generics. +// All includes not under simdjson/generic/ondemand must be here! +// Otherwise, amalgamation will fail. 
+#include "simdjson/dom/base.h" // for MINIMAL_DOCUMENT_CAPACITY +#include "simdjson/implementation.h" +#include "simdjson/padded_string.h" +#include "simdjson/padded_string_view.h" +#include "simdjson/internal/dom_parser_implementation.h" + +#endif // SIMDJSON_GENERIC_ONDEMAND_DEPENDENCIES_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/document-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document-inl.h new file mode 100644 index 000000000000..3af60b087701 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document-inl.h @@ -0,0 +1,917 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/array_iterator.h" +#include "simdjson/generic/ondemand/document.h" +#include "simdjson/generic/ondemand/json_type.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/value.h" +#include "simdjson/generic/ondemand/array-inl.h" +#include "simdjson/generic/ondemand/json_iterator-inl.h" +#include "simdjson/generic/ondemand/object-inl.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline document::document(ondemand::json_iterator &&_iter) noexcept + : iter{std::forward(_iter)} +{ + logger::log_start_value(iter, "document"); +} + +simdjson_inline document document::start(json_iterator &&iter) noexcept { + return document(std::forward(iter)); +} + +inline void document::rewind() noexcept { + iter.rewind(); +} + +inline std::string document::to_debug_string() noexcept { + return iter.to_string(); +} + +inline simdjson_result document::current_location() const noexcept { + return iter.current_location(); +} + +inline int32_t 
document::current_depth() const noexcept { + return iter.depth(); +} + +inline bool document::at_end() const noexcept { + return iter.at_end(); +} + + +inline bool document::is_alive() noexcept { + return iter.is_alive(); +} +simdjson_inline value_iterator document::resume_value_iterator() noexcept { + return value_iterator(&iter, 1, iter.root_position()); +} +simdjson_inline value_iterator document::get_root_value_iterator() noexcept { + return resume_value_iterator(); +} +simdjson_inline simdjson_result document::start_or_resume_object() noexcept { + if (iter.at_root()) { + return get_object(); + } else { + return object::resume(resume_value_iterator()); + } +} +simdjson_inline simdjson_result document::get_value() noexcept { + // Make sure we start any arrays or objects before returning, so that start_root_() + // gets called. + + // It is the convention throughout the code that the macro `SIMDJSON_DEVELOPMENT_CHECKS` determines whether + // we check for OUT_OF_ORDER_ITERATION. Proper on::demand code should never trigger this error. +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!iter.at_root()) { return OUT_OF_ORDER_ITERATION; } +#endif + // assert_at_root() serves two purposes: in Debug mode, whether or not + // SIMDJSON_DEVELOPMENT_CHECKS is set or not, it checks that we are at the root of + // the document (this will typically be redundant). In release mode, it generates + // SIMDJSON_ASSUME statements to allow the compiler to make assumptions. + iter.assert_at_root(); + switch (*iter.peek()) { + case '[': { + // The following lines check that the document ends with ]. + auto value_iterator = get_root_value_iterator(); + auto error = value_iterator.check_root_array(); + if(error) { return error; } + return value(get_root_value_iterator()); + } + case '{': { + // The following lines would check that the document ends with }. 
+ auto value_iterator = get_root_value_iterator(); + auto error = value_iterator.check_root_object(); + if(error) { return error; } + return value(get_root_value_iterator()); + } + default: + // Unfortunately, scalar documents are a special case in simdjson and they cannot + // be safely converted to value instances. + return SCALAR_DOCUMENT_AS_VALUE; + } +} +simdjson_inline simdjson_result document::get_array() & noexcept { + auto value = get_root_value_iterator(); + return array::start_root(value); +} +simdjson_inline simdjson_result document::get_object() & noexcept { + auto value = get_root_value_iterator(); + return object::start_root(value); +} + +/** + * We decided that calling 'get_double()' on the JSON document '1.233 blabla' should + * give an error, so we check for trailing content. We want to disallow trailing + * content. + * Thus, in several implementations below, we pass a 'true' parameter value to + * a get_root_value_iterator() method: this indicates that we disallow trailing content. 
+ */ + +simdjson_inline simdjson_result document::get_uint64() noexcept { + return get_root_value_iterator().get_root_uint64(true); +} +simdjson_inline simdjson_result document::get_uint64_in_string() noexcept { + return get_root_value_iterator().get_root_uint64_in_string(true); +} +simdjson_inline simdjson_result document::get_int64() noexcept { + return get_root_value_iterator().get_root_int64(true); +} +simdjson_inline simdjson_result document::get_int64_in_string() noexcept { + return get_root_value_iterator().get_root_int64_in_string(true); +} +simdjson_inline simdjson_result document::get_double() noexcept { + return get_root_value_iterator().get_root_double(true); +} +simdjson_inline simdjson_result document::get_double_in_string() noexcept { + return get_root_value_iterator().get_root_double_in_string(true); +} +simdjson_inline simdjson_result document::get_string(bool allow_replacement) noexcept { + return get_root_value_iterator().get_root_string(true, allow_replacement); +} +template +simdjson_inline error_code document::get_string(string_type& receiver, bool allow_replacement) noexcept { + return get_root_value_iterator().get_root_string(receiver, true, allow_replacement); +} +simdjson_inline simdjson_result document::get_wobbly_string() noexcept { + return get_root_value_iterator().get_root_wobbly_string(true); +} +simdjson_inline simdjson_result document::get_raw_json_string() noexcept { + return get_root_value_iterator().get_root_raw_json_string(true); +} +simdjson_inline simdjson_result document::get_bool() noexcept { + return get_root_value_iterator().get_root_bool(true); +} +simdjson_inline simdjson_result document::is_null() noexcept { + return get_root_value_iterator().is_root_null(true); +} + +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_array(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_object(); } +template<> simdjson_inline simdjson_result document::get() & 
noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_string(false); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_double(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_uint64(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_int64(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_bool(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_value(); } + +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return get_raw_json_string(); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return get_string(false); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_double(); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_uint64(); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_int64(); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_bool(); } +template<> simdjson_deprecated simdjson_inline simdjson_result document::get() && noexcept { return get_value(); } + +template simdjson_inline error_code document::get(T &out) & noexcept { + return get().get(out); +} +template simdjson_deprecated simdjson_inline error_code document::get(T &out) && noexcept { + return std::forward(*this).get().get(out); +} + +#if SIMDJSON_EXCEPTIONS +template +simdjson_deprecated simdjson_inline document::operator T() && noexcept(false) { return get(); } +template +simdjson_inline document::operator T() & noexcept(false) { return get(); } 
+simdjson_inline document::operator array() & noexcept(false) { return get_array(); } +simdjson_inline document::operator object() & noexcept(false) { return get_object(); } +simdjson_inline document::operator uint64_t() noexcept(false) { return get_uint64(); } +simdjson_inline document::operator int64_t() noexcept(false) { return get_int64(); } +simdjson_inline document::operator double() noexcept(false) { return get_double(); } +simdjson_inline document::operator std::string_view() noexcept(false) { return get_string(false); } +simdjson_inline document::operator raw_json_string() noexcept(false) { return get_raw_json_string(); } +simdjson_inline document::operator bool() noexcept(false) { return get_bool(); } +simdjson_inline document::operator value() noexcept(false) { return get_value(); } + +#endif +simdjson_inline simdjson_result document::count_elements() & noexcept { + auto a = get_array(); + simdjson_result answer = a.count_elements(); + /* If there was an array, we are now left pointing at its first element. */ + if(answer.error() == SUCCESS) { rewind(); } + return answer; +} +simdjson_inline simdjson_result document::count_fields() & noexcept { + auto a = get_object(); + simdjson_result answer = a.count_fields(); + /* If there was an object, we are now left pointing at its first element. 
*/ + if(answer.error() == SUCCESS) { rewind(); } + return answer; +} +simdjson_inline simdjson_result document::at(size_t index) & noexcept { + auto a = get_array(); + return a.at(index); +} +simdjson_inline simdjson_result document::begin() & noexcept { + return get_array().begin(); +} +simdjson_inline simdjson_result document::end() & noexcept { + return {}; +} + +simdjson_inline simdjson_result document::find_field(std::string_view key) & noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result document::find_field(const char *key) & noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result document::find_field_unordered(std::string_view key) & noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result document::find_field_unordered(const char *key) & noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result document::operator[](std::string_view key) & noexcept { + return start_or_resume_object()[key]; +} +simdjson_inline simdjson_result document::operator[](const char *key) & noexcept { + return start_or_resume_object()[key]; +} + +simdjson_inline error_code document::consume() noexcept { + auto error = iter.skip_child(0); + if(error) { iter.abandon(); } + return error; +} + +simdjson_inline simdjson_result document::raw_json() noexcept { + auto _iter = get_root_value_iterator(); + const uint8_t * starting_point{_iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. 
+ const uint8_t * final_point{iter.unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_inline simdjson_result document::type() noexcept { + return get_root_value_iterator().type(); +} + +simdjson_inline simdjson_result document::is_scalar() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return ! ((this_type == json_type::array) || (this_type == json_type::object)); +} + +simdjson_inline simdjson_result document::is_string() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return (this_type == json_type::string); +} + +simdjson_inline bool document::is_negative() noexcept { + return get_root_value_iterator().is_root_negative(); +} + +simdjson_inline simdjson_result document::is_integer() noexcept { + return get_root_value_iterator().is_root_integer(true); +} + +simdjson_inline simdjson_result document::get_number_type() noexcept { + return get_root_value_iterator().get_root_number_type(true); +} + +simdjson_inline simdjson_result document::get_number() noexcept { + return get_root_value_iterator().get_root_number(true); +} + + +simdjson_inline simdjson_result document::raw_json_token() noexcept { + auto _iter = get_root_value_iterator(); + return std::string_view(reinterpret_cast(_iter.peek_start()), _iter.peek_root_length()); +} + +simdjson_inline simdjson_result document::at_pointer(std::string_view json_pointer) noexcept { + rewind(); // Rewind the document each time at_pointer is called + if (json_pointer.empty()) { + return this->get_value(); + } + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + return INVALID_JSON_POINTER; + } +} + +simdjson_inline simdjson_result 
document::at_path(std::string_view json_path) noexcept { + rewind(); // Rewind the document each time at_pointer is called + if (json_path.empty()) { + return this->get_value(); + } + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) { + case json_type::array: + return (*this).get_array().at_path(json_path); + case json_type::object: + return (*this).get_object().at_path(json_path); + default: + return INVALID_JSON_POINTER; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::document &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base( + error + ) +{ +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return SUCCESS; +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) & noexcept { + if (error()) { 
return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::get_array() & noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() & noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64_in_string() noexcept { + if (error()) { return error(); } + return first.get_uint64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64_in_string() noexcept { + if (error()) { return error(); } + return first.get_int64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_double_in_string() noexcept { + if (error()) { return error(); } + return first.get_double_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_string(bool allow_replacement) noexcept { + if (error()) { 
return error(); } + return first.get_string(allow_replacement); +} +template +simdjson_inline error_code simdjson_result::get_string(string_type& receiver, bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.get_string(receiver, allow_replacement); +} +simdjson_inline simdjson_result simdjson_result::get_wobbly_string() noexcept { + if (error()) { return error(); } + return first.get_wobbly_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::get_value() noexcept { + if (error()) { return error(); } + return first.get_value(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} + +template +simdjson_inline simdjson_result simdjson_result::get() & noexcept { + if (error()) { return error(); } + return first.get(); +} +template +simdjson_deprecated simdjson_inline simdjson_result simdjson_result::get() && noexcept { + if (error()) { return error(); } + return std::forward(first).get(); +} +template +simdjson_inline error_code simdjson_result::get(T &out) & noexcept { + if (error()) { return error(); } + return first.get(out); +} +template +simdjson_inline error_code simdjson_result::get(T &out) && noexcept { + if (error()) { return error(); } + return std::forward(first).get(out); +} + +template<> simdjson_inline simdjson_result simdjson_result::get() & noexcept = delete; +template<> simdjson_deprecated simdjson_inline simdjson_result simdjson_result::get() && noexcept { + if (error()) { return error(); } + return std::forward(first); +} +template<> simdjson_inline error_code simdjson_result::get(SIMDJSON_IMPLEMENTATION::ondemand::document &out) & 
noexcept = delete; +template<> simdjson_inline error_code simdjson_result::get(SIMDJSON_IMPLEMENTATION::ondemand::document &out) && noexcept { + if (error()) { return error(); } + out = std::forward(first); + return SUCCESS; +} + +simdjson_inline simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} + +simdjson_inline simdjson_result simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} + +simdjson_inline simdjson_result simdjson_result::is_string() noexcept { + if (error()) { return error(); } + return first.is_string(); +} + +simdjson_inline bool simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} + +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} + +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} + +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} + + +#if SIMDJSON_EXCEPTIONS +template ::value == false>::type> +simdjson_inline simdjson_result::operator T() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::array() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::object() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw 
simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::value() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline bool simdjson_result::at_end() const noexcept { + if (error()) { return error(); } + return first.at_end(); +} + + +simdjson_inline int32_t simdjson_result::current_depth() const noexcept { + if (error()) { return error(); } + return first.current_depth(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + +simdjson_inline simdjson_result simdjson_result::at_path(std::string_view json_path) noexcept { + if (error()) { return error(); } + return first.at_path(json_path); +} + +} // namespace simdjson + + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline document_reference::document_reference() noexcept : doc{nullptr} {} +simdjson_inline 
document_reference::document_reference(document &d) noexcept : doc(&d) {} +simdjson_inline void document_reference::rewind() noexcept { doc->rewind(); } +simdjson_inline simdjson_result document_reference::get_array() & noexcept { return doc->get_array(); } +simdjson_inline simdjson_result document_reference::get_object() & noexcept { return doc->get_object(); } +/** + * The document_reference instances are used primarily/solely for streams of JSON + * documents. + * We decided that calling 'get_double()' on the JSON document '1.233 blabla' should + * give an error, so we check for trailing content. + * + * However, for streams of JSON documents, we want to be able to start from + * "321" "321" "321" + * and parse it successfully as a stream of JSON documents, calling get_uint64_in_string() + * successfully each time. + * + * To achieve this result, we pass a 'false' to a get_root_value_iterator() method: + * this indicates that we allow trailing content. + */ +simdjson_inline simdjson_result document_reference::get_uint64() noexcept { return doc->get_root_value_iterator().get_root_uint64(false); } +simdjson_inline simdjson_result document_reference::get_uint64_in_string() noexcept { return doc->get_root_value_iterator().get_root_uint64_in_string(false); } +simdjson_inline simdjson_result document_reference::get_int64() noexcept { return doc->get_root_value_iterator().get_root_int64(false); } +simdjson_inline simdjson_result document_reference::get_int64_in_string() noexcept { return doc->get_root_value_iterator().get_root_int64_in_string(false); } +simdjson_inline simdjson_result document_reference::get_double() noexcept { return doc->get_root_value_iterator().get_root_double(false); } +simdjson_inline simdjson_result document_reference::get_double_in_string() noexcept { return doc->get_root_value_iterator().get_root_double(false); } +simdjson_inline simdjson_result document_reference::get_string(bool allow_replacement) noexcept { return 
doc->get_root_value_iterator().get_root_string(false, allow_replacement); } +template +simdjson_inline error_code document_reference::get_string(string_type& receiver, bool allow_replacement) noexcept { return doc->get_root_value_iterator().get_root_string(receiver, false, allow_replacement); } +simdjson_inline simdjson_result document_reference::get_wobbly_string() noexcept { return doc->get_root_value_iterator().get_root_wobbly_string(false); } +simdjson_inline simdjson_result document_reference::get_raw_json_string() noexcept { return doc->get_root_value_iterator().get_root_raw_json_string(false); } +simdjson_inline simdjson_result document_reference::get_bool() noexcept { return doc->get_root_value_iterator().get_root_bool(false); } +simdjson_inline simdjson_result document_reference::get_value() noexcept { return doc->get_value(); } +simdjson_inline simdjson_result document_reference::is_null() noexcept { return doc->get_root_value_iterator().is_root_null(false); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_array(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_object(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_string(false); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_double(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_uint64(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_int64(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_bool(); } +template<> simdjson_inline simdjson_result document_reference::get() & noexcept { return get_value(); } +#if SIMDJSON_EXCEPTIONS +template +simdjson_inline 
document_reference::operator T() noexcept(false) { return get(); } +simdjson_inline document_reference::operator array() & noexcept(false) { return array(*doc); } +simdjson_inline document_reference::operator object() & noexcept(false) { return object(*doc); } +simdjson_inline document_reference::operator uint64_t() noexcept(false) { return get_uint64(); } +simdjson_inline document_reference::operator int64_t() noexcept(false) { return get_int64(); } +simdjson_inline document_reference::operator double() noexcept(false) { return get_double(); } +simdjson_inline document_reference::operator std::string_view() noexcept(false) { return std::string_view(*doc); } +simdjson_inline document_reference::operator raw_json_string() noexcept(false) { return get_raw_json_string(); } +simdjson_inline document_reference::operator bool() noexcept(false) { return get_bool(); } +simdjson_inline document_reference::operator value() noexcept(false) { return value(*doc); } +#endif +simdjson_inline simdjson_result document_reference::count_elements() & noexcept { return doc->count_elements(); } +simdjson_inline simdjson_result document_reference::count_fields() & noexcept { return doc->count_fields(); } +simdjson_inline simdjson_result document_reference::at(size_t index) & noexcept { return doc->at(index); } +simdjson_inline simdjson_result document_reference::begin() & noexcept { return doc->begin(); } +simdjson_inline simdjson_result document_reference::end() & noexcept { return doc->end(); } +simdjson_inline simdjson_result document_reference::find_field(std::string_view key) & noexcept { return doc->find_field(key); } +simdjson_inline simdjson_result document_reference::find_field(const char *key) & noexcept { return doc->find_field(key); } +simdjson_inline simdjson_result document_reference::operator[](std::string_view key) & noexcept { return (*doc)[key]; } +simdjson_inline simdjson_result document_reference::operator[](const char *key) & noexcept { return (*doc)[key]; } 
+simdjson_inline simdjson_result document_reference::find_field_unordered(std::string_view key) & noexcept { return doc->find_field_unordered(key); } +simdjson_inline simdjson_result document_reference::find_field_unordered(const char *key) & noexcept { return doc->find_field_unordered(key); } +simdjson_inline simdjson_result document_reference::type() noexcept { return doc->type(); } +simdjson_inline simdjson_result document_reference::is_scalar() noexcept { return doc->is_scalar(); } +simdjson_inline simdjson_result document_reference::is_string() noexcept { return doc->is_string(); } +simdjson_inline simdjson_result document_reference::current_location() noexcept { return doc->current_location(); } +simdjson_inline int32_t document_reference::current_depth() const noexcept { return doc->current_depth(); } +simdjson_inline bool document_reference::is_negative() noexcept { return doc->is_negative(); } +simdjson_inline simdjson_result document_reference::is_integer() noexcept { return doc->get_root_value_iterator().is_root_integer(false); } +simdjson_inline simdjson_result document_reference::get_number_type() noexcept { return doc->get_root_value_iterator().get_root_number_type(false); } +simdjson_inline simdjson_result document_reference::get_number() noexcept { return doc->get_root_value_iterator().get_root_number(false); } +simdjson_inline simdjson_result document_reference::raw_json_token() noexcept { return doc->raw_json_token(); } +simdjson_inline simdjson_result document_reference::at_pointer(std::string_view json_pointer) noexcept { return doc->at_pointer(json_pointer); } +simdjson_inline simdjson_result document_reference::at_path(std::string_view json_path) noexcept { return doc->at_path(json_path); } +simdjson_inline simdjson_result document_reference::raw_json() noexcept { return doc->raw_json();} +simdjson_inline document_reference::operator document&() const noexcept { return *doc; } + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION 
+} // namespace simdjson + + + +namespace simdjson { +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::document_reference value, error_code error) + noexcept : implementation_simdjson_result_base(std::forward(value), error) {} + + +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return SUCCESS; +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) & noexcept { + if 
(error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::get_array() & noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() & noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64_in_string() noexcept { + if (error()) { return error(); } + return first.get_uint64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64_in_string() noexcept { + if (error()) { return error(); } + return first.get_int64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_double_in_string() noexcept { + if (error()) { return error(); } + return first.get_double_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_string(bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.get_string(allow_replacement); +} +template +simdjson_inline error_code simdjson_result::get_string(string_type& receiver, bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.get_string(receiver, allow_replacement); +} +simdjson_inline simdjson_result simdjson_result::get_wobbly_string() noexcept { + if (error()) { return error(); } + return first.get_wobbly_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result 
simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::get_value() noexcept { + if (error()) { return error(); } + return first.get_value(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} +simdjson_inline simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} +simdjson_inline simdjson_result simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} +simdjson_inline simdjson_result simdjson_result::is_string() noexcept { + if (error()) { return error(); } + return first.is_string(); +} +simdjson_inline simdjson_result simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} +#if SIMDJSON_EXCEPTIONS +template ::value == false>::type> +simdjson_inline simdjson_result::operator T() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::array() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::object() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { 
throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::value() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + +simdjson_inline simdjson_result simdjson_result::at_path(std::string_view json_path) noexcept { + if (error()) { + return error(); + } + return first.at_path(json_path); +} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/document.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document.h new file mode 100644 index 000000000000..390180e6f42a --- /dev/null +++ 
b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document.h @@ -0,0 +1,914 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A JSON document. It holds a json_iterator instance. + * + * Used by tokens to get text, and string buffer location. + * + * You must keep the document around during iteration. + */ +class document { +public: + /** + * Create a new invalid document. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline document() noexcept = default; + simdjson_inline document(const document &other) noexcept = delete; // pass your documents by reference, not by copy + simdjson_inline document(document &&other) noexcept = default; + simdjson_inline document &operator=(const document &other) noexcept = delete; + simdjson_inline document &operator=(document &&other) noexcept = default; + + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @returns INCORRECT_TYPE If the JSON value is not an array. + */ + simdjson_inline simdjson_result get_array() & noexcept; + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @returns INCORRECT_TYPE If the JSON value is not an object. + */ + simdjson_inline simdjson_result get_object() & noexcept; + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64() noexcept; + /** + * Cast this JSON value (inside string) to an unsigned integer. 
+ * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64() noexcept; + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64_in_string() noexcept; + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double() noexcept; + + /** + * Cast this JSON value (inside string) to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double_in_string() noexcept; + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Important: Calling get_string() twice on the same document is an error. + * + * @param Whether to allow a replacement character for unmatched surrogate pairs. + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + /** + * Attempts to fill the provided std::string reference with the parsed value of the current string. + * + * The string is guaranteed to be valid UTF-8. + * + * Important: a value should be consumed once. Calling get_string() twice on the same value + * is an error. 
+ * + * Performance: This method may be slower than get_string() or get_string(bool) because it may need to allocate memory. + * We recommend you avoid allocating an std::string unless you need to. + * + * @returns INCORRECT_TYPE if the JSON value is not a string. Otherwise, we return SUCCESS. + */ + template + simdjson_inline error_code get_string(string_type& receiver, bool allow_replacement = false) noexcept; + /** + * Cast this JSON value to a string. + * + * The string is not guaranteed to be valid UTF-8. See https://simonsapin.github.io/wtf-8/ + * + * Important: Calling get_wobbly_string() twice on the same document is an error. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_wobbly_string() noexcept; + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_raw_json_string() noexcept; + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @returns INCORRECT_TYPE if the JSON value is not true or false. + */ + simdjson_inline simdjson_result get_bool() noexcept; + /** + * Cast this JSON value to a value when the document is an object or an array. + * + * You must not have begun iterating through the object or array. When + * SIMDJSON_DEVELOPMENT_CHECKS is set to 1 (which is the case when building in Debug mode + * by default), and you have already begun iterating, + * you will get an OUT_OF_ORDER_ITERATION error. If you have begun iterating, you can use + * rewind() to reset the document to its initial state before calling this method. 
+ * + * @returns A value if a JSON array or object cannot be found. + * @returns SCALAR_DOCUMENT_AS_VALUE error is the document is a scalar (see is_scalar() function). + */ + simdjson_inline simdjson_result get_value() noexcept; + + /** + * Checks if this JSON value is null. If and only if the value is + * null, then it is consumed (we advance). If we find a token that + * begins with 'n' but is not 'null', then an error is returned. + * + * @returns Whether the value is null. + * @returns INCORRECT_TYPE If the JSON value begins with 'n' and is not 'null'. + */ + simdjson_inline simdjson_result is_null() noexcept; + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array(), get_raw_json_string(), or get_string() instead. + * + * @returns A value of the given type, parsed from the JSON. + * @returns INCORRECT_TYPE If the JSON value is not the given type. + */ + template simdjson_inline simdjson_result get() & noexcept { + // Unless the simdjson library or the user provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library. " + "The supported types are ondemand::object, ondemand::array, raw_json_string, std::string_view, uint64_t, " + "int64_t, double, and bool. We recommend you use get_double(), get_bool(), get_uint64(), get_int64(), " + " get_object(), get_array(), get_raw_json_string(), or get_string() instead of the get template." 
+ " You may also add support for custom types, see our documentation."); + } + /** @overload template simdjson_result get() & noexcept */ + template simdjson_deprecated simdjson_inline simdjson_result get() && noexcept { + // Unless the simdjson library or the user provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library. " + "The supported types are ondemand::object, ondemand::array, raw_json_string, std::string_view, uint64_t, " + "int64_t, double, and bool. We recommend you use get_double(), get_bool(), get_uint64(), get_int64(), " + " get_object(), get_array(), get_raw_json_string(), or get_string() instead of the get template." + " You may also add support for custom types, see our documentation."); + } + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool, value + * + * Be mindful that the document instance must remain in scope while you are accessing object, array and value instances. + * + * @param out This is set to a value of the given type, parsed from the JSON. If there is an error, this may not be initialized. + * @returns INCORRECT_TYPE If the JSON value is not an object. + * @returns SUCCESS If the parse succeeded and the out parameter was set to the value. + */ + template simdjson_inline error_code get(T &out) & noexcept; + /** @overload template error_code get(T &out) & noexcept */ + template simdjson_deprecated simdjson_inline error_code get(T &out) && noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Cast this JSON value to an instance of type T. The programmer is responsible for + * providing an implementation of get for the type T, if T is not one of the types + * supported by the library (object, array, raw_json_string, string_view, uint64_t, etc.) 
+ * + * See https://github.com/simdjson/simdjson/blob/master/doc/basics.md#adding-support-for-custom-types + * + * @returns An instance of type T + */ + template + explicit simdjson_inline operator T() & noexcept(false); + template + explicit simdjson_deprecated simdjson_inline operator T() && noexcept(false); + + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an array. + */ + simdjson_inline operator array() & noexcept(false); + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an object. + */ + simdjson_inline operator object() & noexcept(false); + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline operator uint64_t() noexcept(false); + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit integer. + */ + simdjson_inline operator int64_t() noexcept(false); + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a valid floating-point number. + */ + simdjson_inline operator double() noexcept(false); + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. 
+ */ + simdjson_inline operator std::string_view() noexcept(false); + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator raw_json_string() noexcept(false); + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not true or false. + */ + simdjson_inline operator bool() noexcept(false); + /** + * Cast this JSON value to a value when the document is an object or an array. + * + * You must not have begun iterating through the object or array. When + * SIMDJSON_DEVELOPMENT_CHECKS is defined, and you have already begun iterating, + * you will get an OUT_OF_ORDER_ITERATION error. If you have begun iterating, you can use + * rewind() to reset the document to its initial state before calling this method. + * + * @returns A value value if a JSON array or object cannot be found. + * @exception SCALAR_DOCUMENT_AS_VALUE error is the document is a scalar (see is_scalar() function). + */ + simdjson_inline operator value() noexcept(false); +#endif + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. 
+ */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. + * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) & noexcept; + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result begin() & noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result end() & noexcept; + + /** + * Look up a field by name on an object (order-sensitive). 
+ * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. E.g., the array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to + * a key a single time. Doing object["mykey"].to_string()and then again object["mykey"].to_string() + * is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. 
It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field was not there when they are not in order). + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. E.g., the array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to a key + * a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() + * is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. 
+ */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + + /** + * Get the type of this JSON value. It does not validate or consume the value. + * E.g., you must still call "is_null()" to check that a value is null even if + * "type()" returns json_type::null. + * + * NOTE: If you're only expecting a value to be one type (a typical case), it's generally + * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just + * let it throw an exception). + * + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result type() noexcept; + + /** + * Checks whether the document is a scalar (string, number, null, Boolean). + * Returns false when there it is an array or object. + * + * @returns true if the type is string, number, null, Boolean + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result is_scalar() noexcept; + + /** + * Checks whether the document is a string. + * + * @returns true if the type is string + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result is_string() noexcept; + + /** + * Checks whether the document is a negative number. + * + * @returns true if the number if negative. 
+ */ + simdjson_inline bool is_negative() noexcept; + /** + * Checks whether the document is an integer number. Note that + * this requires to partially parse the number string. If + * the value is determined to be an integer, it may still + * not parse properly as an integer in subsequent steps + * (e.g., it might overflow). + * + * @returns true if the number if negative. + */ + simdjson_inline simdjson_result is_integer() noexcept; + /** + * Determine the number type (integer or floating-point number) as quickly + * as possible. This function does not fully validate the input. It is + * useful when you only need to classify the numbers, without parsing them. + * + * If you are planning to retrieve the value or you need full validation, + * consider using the get_number() method instead: it will fully parse + * and validate the input, and give you access to the type: + * get_number().get_number_type(). + * + * get_number_type() is number_type::unsigned_integer if we have + * an integer greater or equal to 9223372036854775808 and no larger than 18446744073709551615. + * get_number_type() is number_type::signed_integer if we have an + * integer that is less than 9223372036854775808 and greater or equal to -9223372036854775808. + * get_number_type() is number_type::big_integer if we have an integer outside + * of those ranges (either larger than 18446744073709551615 or smaller than -9223372036854775808). + * Otherwise, get_number_type() has value number_type::floating_point_number + * + * This function requires processing the number string, but it is expected + * to be faster than get_number().get_number_type() because it is does not + * parse the number value. + * + * @returns the type of the number + */ + simdjson_inline simdjson_result get_number_type() noexcept; + + /** + * Attempt to parse an ondemand::number. An ondemand::number may + * contain an integer value or a floating-point value, the simdjson + * library will autodetect the type. 
Thus it is a dynamically typed + * number. Before accessing the value, you must determine the detected + * type. + * + * number.get_number_type() is number_type::signed_integer if we have + * an integer in [-9223372036854775808,9223372036854775808) + * You can recover the value by calling number.get_int64() and you + * have that number.is_int64() is true. + * + * number.get_number_type() is number_type::unsigned_integer if we have + * an integer in [9223372036854775808,18446744073709551616) + * You can recover the value by calling number.get_uint64() and you + * have that number.is_uint64() is true. + * + * Otherwise, number.get_number_type() has value number_type::floating_point_number + * and we have a binary64 number. + * You can recover the value by calling number.get_double() and you + * have that number.is_double() is true. + * + * You must check the type before accessing the value: it is an error + * to call "get_int64()" when number.get_number_type() is not + * number_type::signed_integer and when number.is_int64() is false. + */ + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + /** + * Get the raw JSON for this token. + * + * The string_view will always point into the input buffer. + * + * The string_view will start at the beginning of the token, and include the entire token + * *as well as all spaces until the next token (or EOF).* This means, for example, that a + * string token always begins with a " and is always terminated by the final ", possibly + * followed by a number of spaces. + * + * The string_view is *not* null-terminated. If this is a scalar (string, number, + * boolean, or null), the character after the end of the string_view may be the padded buffer. + * + * Tokens include: + * - { + * - [ + * - "a string (possibly with UTF-8 or backslashed characters like \\\")". 
+ * - -1.2e-100 + * - true + * - false + * - null + */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + /** + * Reset the iterator inside the document instance so we are pointing back at the + * beginning of the document, as if it had just been created. It invalidates all + * values, objects and arrays that you have created so far (including unescaped strings). + */ + inline void rewind() noexcept; + /** + * Returns debugging information. + */ + inline std::string to_debug_string() noexcept; + /** + * Some unrecoverable error conditions may render the document instance unusable. + * The is_alive() method returns true when the document is still suitable. + */ + inline bool is_alive() noexcept; + + /** + * Returns the current location in the document if in bounds. + */ + inline simdjson_result current_location() const noexcept; + + /** + * Returns true if this document has been fully parsed. + * If you have consumed the whole document and at_end() returns + * false, then there may be trailing content. + */ + inline bool at_end() const noexcept; + + /** + * Returns the current depth in the document if in bounds. + * + * E.g., + * 0 = finished with document + * 1 = document root value (could be [ or {, not yet known) + * 2 = , or } inside root array/object + * 3 = key or value inside root array/object. + */ + simdjson_inline int32_t current_depth() const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. 
+ * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/\\u00E9") == 123 + * doc.at_pointer((const char*)u8"/\u00E9") returns an error (NO_SUCH_FIELD) + * + * Note that at_pointer() automatically calls rewind between each call. Thus + * all values, objects and arrays that you have created so far (including unescaped strings) + * are invalidated. After calling at_pointer, you need to consume the result: string values + * should be stored in your own variables, arrays should be decoded and stored in your own array-like + * structures and so forth. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + * - SCALAR_DOCUMENT_AS_VALUE if the json_pointer is empty and the document is not a scalar (see is_scalar() function). + */ + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Get the value associated with the given JSONPath expression. 
We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_path(".\\u00E9") == 123 + * doc.at_path((const char*)u8".\u00E9") returns an error (NO_SUCH_FIELD) + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + */ + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; + + /** + * Consumes the document and returns a string_view instance corresponding to the + * document as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_inline simdjson_result raw_json() noexcept; +protected: + /** + * Consumes the document. 
+ */ + simdjson_inline error_code consume() noexcept; + + simdjson_inline document(ondemand::json_iterator &&iter) noexcept; + simdjson_inline const uint8_t *text(uint32_t idx) const noexcept; + + simdjson_inline value_iterator resume_value_iterator() noexcept; + simdjson_inline value_iterator get_root_value_iterator() noexcept; + simdjson_inline simdjson_result start_or_resume_object() noexcept; + static simdjson_inline document start(ondemand::json_iterator &&iter) noexcept; + + // + // Fields + // + json_iterator iter{}; ///< Current position in the document + static constexpr depth_t DOCUMENT_DEPTH = 0; ///< document depth is always 0 + + friend class array_iterator; + friend class value; + friend class ondemand::parser; + friend class object; + friend class array; + friend class field; + friend class token; + friend class document_stream; + friend class document_reference; +}; + + +/** + * A document_reference is a thin wrapper around a document reference instance. + */ +class document_reference { +public: + simdjson_inline document_reference() noexcept; + simdjson_inline document_reference(document &d) noexcept; + simdjson_inline document_reference(const document_reference &other) noexcept = default; + simdjson_inline document_reference& operator=(const document_reference &other) noexcept = default; + simdjson_inline void rewind() noexcept; + simdjson_inline simdjson_result get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + template + simdjson_inline error_code 
get_string(string_type& receiver, bool allow_replacement = false) noexcept; + simdjson_inline simdjson_result get_wobbly_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + + simdjson_inline simdjson_result is_null() noexcept; + template simdjson_inline simdjson_result get() & noexcept; + simdjson_inline simdjson_result raw_json() noexcept; + simdjson_inline operator document&() const noexcept; +#if SIMDJSON_EXCEPTIONS + template + explicit simdjson_inline operator T() noexcept(false); + simdjson_inline operator array() & noexcept(false); + simdjson_inline operator object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + + simdjson_inline simdjson_result type() noexcept; + simdjson_inline 
simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result is_string() noexcept; + + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline int32_t current_depth() const noexcept; + simdjson_inline bool is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + simdjson_inline simdjson_result raw_json_token() noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; + +private: + document *doc{nullptr}; +}; +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::document &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline error_code rewind() noexcept; + + simdjson_inline simdjson_result get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + template + simdjson_inline error_code get_string(string_type& receiver, bool allow_replacement = false) noexcept; + simdjson_inline simdjson_result get_wobbly_string() noexcept; + 
simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + simdjson_inline simdjson_result is_null() noexcept; + + template simdjson_inline simdjson_result get() & noexcept; + template simdjson_deprecated simdjson_inline simdjson_result get() && noexcept; + + template simdjson_inline error_code get(T &out) & noexcept; + template simdjson_inline error_code get(T &out) && noexcept; +#if SIMDJSON_EXCEPTIONS + template ::value == false>::type> + explicit simdjson_inline operator T() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::array() & noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result 
find_field_unordered(const char *key) & noexcept; + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result is_string() noexcept; + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline int32_t current_depth() const noexcept; + simdjson_inline bool at_end() const noexcept; + simdjson_inline bool is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + /** @copydoc simdjson_inline std::string_view document::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; +}; + + +} // namespace simdjson + + + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::document_reference value, error_code error) noexcept; + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline error_code rewind() noexcept; + + simdjson_inline simdjson_result get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + template + simdjson_inline error_code get_string(string_type& receiver, bool 
allow_replacement = false) noexcept; + simdjson_inline simdjson_result get_wobbly_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + simdjson_inline simdjson_result is_null() noexcept; +#if SIMDJSON_EXCEPTIONS + template ::value == false>::type> + explicit simdjson_inline operator T() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::array() & noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline 
simdjson_result is_string() noexcept; + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline simdjson_result current_depth() const noexcept; + simdjson_inline simdjson_result is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + /** @copydoc simdjson_inline std::string_view document_reference::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; +}; + + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream-inl.h new file mode 100644 index 000000000000..059ad0ec5f4b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream-inl.h @@ -0,0 +1,433 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/document_stream.h" +#include "simdjson/generic/ondemand/document-inl.h" +#include "simdjson/generic/implementation_simdjson_result_base-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void stage1_worker::finish() { + // After calling "run" someone would call finish() to wait + // for the end of the processing. + // This function will wait until either the thread has done + // the processing or, else, the destructor has been called. 
+ std::unique_lock lock(locking_mutex); + cond_var.wait(lock, [this]{return has_work == false;}); +} + +inline stage1_worker::~stage1_worker() { + // The thread may never outlive the stage1_worker instance + // and will always be stopped/joined before the stage1_worker + // instance is gone. + stop_thread(); +} + +inline void stage1_worker::start_thread() { + std::unique_lock lock(locking_mutex); + if(thread.joinable()) { + return; // This should never happen but we never want to create more than one thread. + } + thread = std::thread([this]{ + while(true) { + std::unique_lock thread_lock(locking_mutex); + // We wait for either "run" or "stop_thread" to be called. + cond_var.wait(thread_lock, [this]{return has_work || !can_work;}); + // If, for some reason, the stop_thread() method was called (i.e., the + // destructor of stage1_worker is called, then we want to immediately destroy + // the thread (and not do any more processing). + if(!can_work) { + break; + } + this->owner->stage1_thread_error = this->owner->run_stage1(*this->stage1_thread_parser, + this->_next_batch_start); + this->has_work = false; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify "finish" + thread_lock.unlock(); + } + } + ); +} + + +inline void stage1_worker::stop_thread() { + std::unique_lock lock(locking_mutex); + // We have to make sure that all locks can be released. 
+ can_work = false; + has_work = false; + cond_var.notify_all(); + lock.unlock(); + if(thread.joinable()) { + thread.join(); + } +} + +inline void stage1_worker::run(document_stream * ds, parser * stage1, size_t next_batch_start) { + std::unique_lock lock(locking_mutex); + owner = ds; + _next_batch_start = next_batch_start; + stage1_thread_parser = stage1; + has_work = true; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify the thread lock that we have work + lock.unlock(); +} + +#endif // SIMDJSON_THREADS_ENABLED + +simdjson_inline document_stream::document_stream( + ondemand::parser &_parser, + const uint8_t *_buf, + size_t _len, + size_t _batch_size, + bool _allow_comma_separated +) noexcept + : parser{&_parser}, + buf{_buf}, + len{_len}, + batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? 
MINIMAL_BATCH_SIZE : _batch_size}, + allow_comma_separated{_allow_comma_separated}, + error{SUCCESS} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change + #endif +{ +#ifdef SIMDJSON_THREADS_ENABLED + if(worker.get() == nullptr) { + error = MEMALLOC; + } +#endif +} + +simdjson_inline document_stream::document_stream() noexcept + : parser{nullptr}, + buf{nullptr}, + len{0}, + batch_size{0}, + allow_comma_separated{false}, + error{UNINITIALIZED} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(false) + #endif +{ +} + +simdjson_inline document_stream::~document_stream() noexcept +{ + #ifdef SIMDJSON_THREADS_ENABLED + worker.reset(); + #endif +} + +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} + +simdjson_inline document_stream::iterator::iterator() noexcept + : stream{nullptr}, finished{true} { +} + +simdjson_inline document_stream::iterator::iterator(document_stream* _stream, bool is_end) noexcept + : stream{_stream}, finished{is_end} { +} + +simdjson_inline simdjson_result document_stream::iterator::operator*() noexcept { + //if(stream->error) { return stream->error; } + return simdjson_result(stream->doc, stream->error); +} + +simdjson_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { + // If there is an error, then we want the iterator + // to be finished, no matter what. (E.g., we do not + // keep generating documents with errors, or go beyond + // a document with errors.) 
+ // + // Users do not have to call "operator*()" when they use operator++, + // so we need to end the stream in the operator++ function. + // + // Note that setting finished = true is essential otherwise + // we would enter an infinite loop. + if (stream->error) { finished = true; } + // Note that stream->error() is guarded against error conditions + // (it will immediately return if stream->error casts to false). + // In effect, this next function does nothing when (stream->error) + // is true (hence the risk of an infinite loop). + stream->next(); + // If that was the last document, we're finished. + // It is the only type of error we do not want to appear + // in operator*. + if (stream->error == EMPTY) { finished = true; } + // If we had any other kind of error (not EMPTY) then we want + // to pass it along to the operator* and we cannot mark the result + // as "finished" just yet. + return *this; +} + +simdjson_inline bool document_stream::iterator::operator!=(const document_stream::iterator &other) const noexcept { + return finished != other.finished; +} + +simdjson_inline document_stream::iterator document_stream::begin() noexcept { + start(); + // If there are no documents, we're finished. 
+ return iterator(this, error == EMPTY); +} + +simdjson_inline document_stream::iterator document_stream::end() noexcept { + return iterator(this, true); +} + +inline void document_stream::start() noexcept { + if (error) { return; } + error = parser->allocate(batch_size); + if (error) { return; } + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(*parser, batch_start); + while(error == EMPTY) { + // In exceptional cases, we may start with an empty block + batch_start = next_batch_start(); + if (batch_start >= len) { return; } + error = run_stage1(*parser, batch_start); + } + if (error) { return; } + doc_index = batch_start; + doc = document(json_iterator(&buf[batch_start], parser)); + doc.iter._streaming = true; + + #ifdef SIMDJSON_THREADS_ENABLED + if (use_thread && next_batch_start() < len) { + // Kick off the first thread on next batch if needed + error = stage1_thread_parser.allocate(batch_size); + if (error) { return; } + worker->start_thread(); + start_stage1_thread(); + if (error) { return; } + } + #endif // SIMDJSON_THREADS_ENABLED +} + +inline void document_stream::next() noexcept { + // We always enter at once once in an error condition. + if (error) { return; } + next_document(); + if (error) { return; } + auto cur_struct_index = doc.iter._root - parser->implementation->structural_indexes.get(); + doc_index = batch_start + parser->implementation->structural_indexes[cur_struct_index]; + + // Check if at end of structural indexes (i.e. 
at end of batch) + if(cur_struct_index >= static_cast(parser->implementation->n_structural_indexes)) { + error = EMPTY; + // Load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + #ifdef SIMDJSON_THREADS_ENABLED + if(use_thread) { + load_from_stage1_thread(); + } else { + error = run_stage1(*parser, batch_start); + } + #else + error = run_stage1(*parser, batch_start); + #endif + /** + * Whenever we move to another window, we need to update all pointers to make + * it appear as if the input buffer started at the beginning of the window. + * + * Take this input: + * + * {"z":5} {"1":1,"2":2,"4":4} [7, 10, 9] [15, 11, 12, 13] [154, 110, 112, 1311] + * + * Say you process the following window... + * + * '{"z":5} {"1":1,"2":2,"4":4} [7, 10, 9]' + * + * When you do so, the json_iterator has a pointer at the beginning of the memory region + * (pointing at the beginning of '{"z"...'. + * + * When you move to the window that starts at... + * + * '[7, 10, 9] [15, 11, 12, 13] ... + * + * then it is not sufficient to just run stage 1. You also need to re-anchor the + * json_iterator so that it believes we are starting at '[7, 10, 9]...'. + * + * Under the DOM front-end, this gets done automatically because the parser owns + * the pointer the data, and when you call stage1 and then stage2 on the same + * parser, then stage2 will run on the pointer acquired by stage1. + * + * That is, stage1 calls "this->buf = _buf" so the parser remembers the buffer that + * we used. But json_iterator has no callback when stage1 is called on the parser. + * In fact, I think that the parser is unaware of json_iterator. + * + * + * So we need to re-anchor the json_iterator after each call to stage 1 so that + * all of the pointers are in sync. + */ + doc.iter = json_iterator(&buf[batch_start], parser); + doc.iter._streaming = true; + /** + * End of resync. 
+ */ + + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. + doc_index = batch_start; + } + } +} + +inline void document_stream::next_document() noexcept { + // Go to next place where depth=0 (document depth) + error = doc.iter.skip_child(0); + if (error) { return; } + // Always set depth=1 at the start of document + doc.iter._depth = 1; + // consume comma if comma separated is allowed + if (allow_comma_separated) { doc.iter.consume_character(','); } + // Resets the string buffer at the beginning, thus invalidating the strings. + doc.iter._string_buf_loc = parser->string_buf.get(); + doc.iter._root = doc.iter.position(); +} + +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +} + +inline error_code document_stream::run_stage1(ondemand::parser &p, size_t _batch_start) noexcept { + // This code only updates the structural index in the parser, it does not update any json_iterator + // instance. 
+ size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial); + } +} + +simdjson_inline size_t document_stream::iterator::current_index() const noexcept { + return stream->doc_index; +} + +simdjson_inline std::string_view document_stream::iterator::source() const noexcept { + auto depth = stream->doc.iter.depth(); + auto cur_struct_index = stream->doc.iter._root - stream->parser->implementation->structural_indexes.get(); + + // If at root, process the first token to determine if scalar value + if (stream->doc.iter.at_root()) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': // Depth=1 already at start of document + break; + case '}': case ']': + depth--; + break; + default: // Scalar value document + // TODO: We could remove trailing whitespaces + // This returns a string spanning from start of value to the beginning of the next document (excluded) + { + auto next_index = stream->parser->implementation->structural_indexes[++cur_struct_index]; + // normally the length would be next_index - current_index() - 1, except for the last document + size_t svlen = next_index - current_index(); + const char *start = reinterpret_cast(stream->buf) + current_index(); + while(svlen > 1 && (std::isspace(start[svlen-1]) || start[svlen-1] == '\0')) { + svlen--; + } + return std::string_view(start, svlen); + } + } + cur_struct_index++; + } + + while (cur_struct_index <= static_cast(stream->parser->implementation->n_structural_indexes)) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': + depth++; + break; + case '}': case ']': + depth--; + break; + } + if (depth == 0) { break; } + 
cur_struct_index++; + } + + return std::string_view(reinterpret_cast(stream->buf) + current_index(), stream->parser->implementation->structural_indexes[cur_struct_index] - current_index() + stream->batch_start + 1);; +} + +inline error_code document_stream::iterator::error() const noexcept { + return stream->error; +} + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + worker->finish(); + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(stage1_thread_parser,*parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! + if (next_batch_start() < len) { + start_stage1_thread(); + } +} + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. 
+ this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + + worker->run(this, & this->stage1_thread_parser, _next_batch_start); +} + +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::document_stream &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} + +} + +#endif // SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream.h new file mode 100644 index 000000000000..ef1265fb3386 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/document_stream.h @@ -0,0 +1,337 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/document.h" +#include "simdjson/generic/ondemand/parser.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#ifdef SIMDJSON_THREADS_ENABLED +#include +#include +#include +#endif + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +#ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ +struct stage1_worker { + stage1_worker() noexcept = default; + stage1_worker(const stage1_worker&) = delete; + stage1_worker(stage1_worker&&) = delete; + stage1_worker operator=(const stage1_worker&) = delete; + ~stage1_worker(); + 
/** + * We only start the thread when it is needed, not at object construction, this may throw. + * You should only call this once. + **/ + void start_thread(); + /** + * Start a stage 1 job. You should first call 'run', then 'finish'. + * You must call start_thread once before. + */ + void run(document_stream * ds, parser * stage1, size_t next_batch_start); + /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/ + void finish(); + +private: + + /** + * Normally, we would never stop the thread. But we do in the destructor. + * This function is only safe assuming that you are not waiting for results. You + * should have called run, then finish, and be done. + **/ + void stop_thread(); + + std::thread thread{}; + /** These three variables define the work done by the thread. **/ + ondemand::parser * stage1_thread_parser{}; + size_t _next_batch_start{}; + document_stream * owner{}; + /** + * We have two state variables. This could be streamlined to one variable in the future but + * we use two for clarity. + */ + bool has_work{false}; + bool can_work{true}; + + /** + * We lock using a mutex. + */ + std::mutex locking_mutex{}; + std::condition_variable cond_var{}; + + friend class document_stream; +}; +#endif // SIMDJSON_THREADS_ENABLED + +/** + * A forward-only stream of documents. + * + * Produced by parser::iterate_many. + * + */ +class document_stream { +public: + /** + * Construct an uninitialized document_stream. + * + * ```c++ + * document_stream docs; + * auto error = parser.iterate_many(json).get(docs); + * ``` + */ + simdjson_inline document_stream() noexcept; + /** Move one document_stream to another. */ + simdjson_inline document_stream(document_stream &&other) noexcept = default; + /** Move one document_stream to another. */ + simdjson_inline document_stream &operator=(document_stream &&other) noexcept = default; + + simdjson_inline ~document_stream() noexcept; + + /** + * Returns the input size in bytes. 
+ */ + inline size_t size_in_bytes() const noexcept; + + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). + * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto & doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; + + class iterator { + public: + using value_type = simdjson_result; + using reference = simdjson_result; + using pointer = void; + using difference_type = std::ptrdiff_t; + using iterator_category = std::input_iterator_tag; + + /** + * Default constructor. + */ + simdjson_inline iterator() noexcept; + /** + * Get the current document (or error). + */ + simdjson_inline reference operator*() noexcept; + /** + * Advance to the next document (prefix). + */ + inline iterator& operator++() noexcept; + /** + * Check if we're at the end yet. + * @param other the end iterator to compare to. + */ + simdjson_inline bool operator!=(const iterator &other) const noexcept; + /** + * @private + * + * Gives the current index in the input document in bytes. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. 
+ */ + simdjson_inline size_t current_index() const noexcept; + + /** + * @private + * + * Gives a view of the current document at the current position. + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * std::string_view v = i.source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + * + */ + simdjson_inline std::string_view source() const noexcept; + + /** + * Returns error of the stream (if any). + */ + inline error_code error() const noexcept; + + private: + simdjson_inline iterator(document_stream *s, bool finished) noexcept; + /** The document_stream we're iterating through. */ + document_stream* stream; + /** Whether we're finished or not. */ + bool finished; + + friend class document; + friend class document_stream; + friend class json_iterator; + }; + + /** + * Start iterating the documents in the stream. + */ + simdjson_inline iterator begin() noexcept; + /** + * The end of the stream, for iterator comparison purposes. + */ + simdjson_inline iterator end() noexcept; + +private: + + document_stream &operator=(const document_stream &) = delete; // Disallow copying + document_stream(const document_stream &other) = delete; // Disallow copying + + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. 
+ * + * @param parser is a reference to the parser instance used to generate this document_stream + * @param buf is the raw byte buffer we need to process + * @param len is the length of the raw byte buffer in bytes + * @param batch_size is the size of the windows (must be strictly greater or equal to the largest JSON document) + */ + simdjson_inline document_stream( + ondemand::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size, + bool allow_comma_separated + ) noexcept; + + /** + * Parse the first document in the buffer. Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; + + /** + * Parse the next document found in the buffer previously given to document_stream. + * + * The content should be a valid JSON document encoded as UTF-8. If there is a + * UTF-8 BOM, the parser skips it. + * + * You do NOT need to pre-allocate a parser. This function takes care of + * pre-allocating a capacity defined by the batch_size defined when creating the + * document_stream object. + * + * The function returns simdjson::EMPTY if there is no more data to be parsed. + * + * The function returns simdjson::SUCCESS (as integer = 0) in case of success + * and indicates that the buffer has successfully been parsed to the end. + * Every document it contained has been parsed without error. + * + * The function returns an error code from simdjson/simdjson.h in case of failure + * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; + * the simdjson::error_message function converts these error codes into a string). + * + * You can also check validity by calling parser.is_valid(). The same parser can + * and should be reused for the other documents in the buffer. + */ + inline void next() noexcept; + + /** Move the json_iterator of the document to the location of the next document in the stream. */ + inline void next_document() noexcept; + + /** Get the next document index. 
*/ + inline size_t next_batch_start() const noexcept; + + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(ondemand::parser &p, size_t batch_start) noexcept; + + // Fields + ondemand::parser *parser; + const uint8_t *buf; + size_t len; + size_t batch_size; + bool allow_comma_separated; + /** + * We are going to use just one document instance. The document owns + * the json_iterator. It implies that we only ever pass a reference + * to the document to the users. + */ + document doc{}; + /** The error (or lack thereof) from the current document. */ + error_code error; + size_t batch_start{0}; + size_t doc_index{}; + + #ifdef SIMDJSON_THREADS_ENABLED + /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ + bool use_thread; + + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. */ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + std::unique_ptr worker{new(std::nothrow) stage1_worker()}; + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. 
+ */ + ondemand::parser stage1_thread_parser{}; + + friend struct stage1_worker; + #endif // SIMDJSON_THREADS_ENABLED + + friend class parser; + friend class document; + friend class json_iterator; + friend struct simdjson_result; + friend struct simdjson::internal::simdjson_result_base; +}; // document_stream + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::document_stream &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_DOCUMENT_STREAM_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/field-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/field-inl.h new file mode 100644 index 000000000000..ae5189536f8e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/field-inl.h @@ -0,0 +1,129 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_FIELD_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_FIELD_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/field.h" +#include "simdjson/generic/ondemand/value-inl.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +// clang 6 does not think the default constructor can be noexcept, so we make it explicit +simdjson_inline field::field() noexcept : std::pair() {} + +simdjson_inline field::field(raw_json_string key, ondemand::value &&value) noexcept + : std::pair(key, std::forward(value)) +{ +} + +simdjson_inline simdjson_result field::start(value_iterator 
&parent_iter) noexcept { + raw_json_string key; + SIMDJSON_TRY( parent_iter.field_key().get(key) ); + SIMDJSON_TRY( parent_iter.field_value() ); + return field::start(parent_iter, key); +} + +simdjson_inline simdjson_result field::start(const value_iterator &parent_iter, raw_json_string key) noexcept { + return field(key, parent_iter.child()); +} + +simdjson_inline simdjson_warn_unused simdjson_result field::unescaped_key(bool allow_replacement) noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() but Visual Studio won't let us. + simdjson_result answer = first.unescape(second.iter.json_iter(), allow_replacement); + first.consume(); + return answer; +} + +template +simdjson_inline simdjson_warn_unused error_code field::unescaped_key(string_type& receiver, bool allow_replacement) noexcept { + std::string_view key; + SIMDJSON_TRY( unescaped_key(allow_replacement).get(key) ); + receiver = key; + return SUCCESS; +} + +simdjson_inline raw_json_string field::key() const noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() by Visual Studio won't let us. + return first; +} + + +simdjson_inline std::string_view field::key_raw_json_token() const noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() by Visual Studio won't let us. + return std::string_view(reinterpret_cast(first.buf-1), second.iter._json_iter->token.peek(-1) - first.buf + 1); +} + +simdjson_inline std::string_view field::escaped_key() const noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() by Visual Studio won't let us. 
+ auto end_quote = second.iter._json_iter->token.peek(-1); + while(*end_quote != '"') end_quote--; + return std::string_view(reinterpret_cast(first.buf), end_quote - first.buf); +} + +simdjson_inline value &field::value() & noexcept { + return second; +} + +simdjson_inline value field::value() && noexcept { + return std::forward(*this).second; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::field &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} + +simdjson_inline simdjson_result simdjson_result::key() noexcept { + if (error()) { return error(); } + return first.key(); +} + +simdjson_inline simdjson_result simdjson_result::key_raw_json_token() noexcept { + if (error()) { return error(); } + return first.key_raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::escaped_key() noexcept { + if (error()) { return error(); } + return first.escaped_key(); +} + +simdjson_inline simdjson_result simdjson_result::unescaped_key(bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.unescaped_key(allow_replacement); +} + +template +simdjson_inline error_code simdjson_result::unescaped_key(string_type &receiver, bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.unescaped_key(receiver, allow_replacement); +} + +simdjson_inline simdjson_result simdjson_result::value() noexcept { + if (error()) { return error(); } + return std::move(first.value()); +} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_FIELD_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/field.h 
b/contrib/libs/simdjson/include/simdjson/generic/ondemand/field.h new file mode 100644 index 000000000000..71344362f9ef --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/field.h @@ -0,0 +1,113 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_FIELD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_FIELD_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/value.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A JSON field (key/value pair) in an object. + * + * Returned from object iteration. + * + * Extends from std::pair so you can use C++ algorithms that rely on pairs. + */ +class field : public std::pair { +public: + /** + * Create a new invalid field. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline field() noexcept; + + /** + * Get the key as a string_view (for higher speed, consider raw_key). + * We deliberately use a more cumbersome name (unescaped_key) to force users + * to think twice about using it. + * + * This consumes the key: once you have called unescaped_key(), you cannot + * call it again nor can you call key(). + */ + simdjson_inline simdjson_warn_unused simdjson_result unescaped_key(bool allow_replacement = false) noexcept; + /** + * Get the key as a string_view (for higher speed, consider raw_key). + * We deliberately use a more cumbersome name (unescaped_key) to force users + * to think twice about using it. The content is stored in the receiver. + * + * This consumes the key: once you have called unescaped_key(), you cannot + * call it again nor can you call key(). 
+ */ + template + simdjson_inline simdjson_warn_unused error_code unescaped_key(string_type& receiver, bool allow_replacement = false) noexcept; + /** + * Get the key as a raw_json_string. Can be used for direct comparison with + * an unescaped C string: e.g., key() == "test". This does not count as + * consumption of the content: you can safely call it repeatedly. + * See escaped_key() for a similar function which returns + * a more convenient std::string_view result. + */ + simdjson_inline raw_json_string key() const noexcept; + /** + * Get the unprocessed key as a string_view. This includes the quotes and may include + * some spaces after the last quote. This does not count as + * consumption of the content: you can safely call it repeatedly. + * See escaped_key(). + */ + simdjson_inline std::string_view key_raw_json_token() const noexcept; + /** + * Get the key as a string_view. This does not include the quotes and + * the string is unprocessed key so it may contain escape characters + * (e.g., \uXXXX or \n). It does not count as a consumption of the content: + * you can safely call it repeatedly. Use unescaped_key() to get the unescaped key. + */ + simdjson_inline std::string_view escaped_key() const noexcept; + /** + * Get the field value. 
+ */ + simdjson_inline ondemand::value &value() & noexcept; + /** + * @overload ondemand::value &ondemand::value() & noexcept + */ + simdjson_inline ondemand::value value() && noexcept; + +protected: + simdjson_inline field(raw_json_string key, ondemand::value &&value) noexcept; + static simdjson_inline simdjson_result start(value_iterator &parent_iter) noexcept; + static simdjson_inline simdjson_result start(const value_iterator &parent_iter, raw_json_string key) noexcept; + friend struct simdjson_result; + friend class object_iterator; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::field &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result unescaped_key(bool allow_replacement = false) noexcept; + template + simdjson_inline error_code unescaped_key(string_type &receiver, bool allow_replacement = false) noexcept; + simdjson_inline simdjson_result key() noexcept; + simdjson_inline simdjson_result key_raw_json_token() noexcept; + simdjson_inline simdjson_result escaped_key() noexcept; + simdjson_inline simdjson_result value() noexcept; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_FIELD_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator-inl.h new file mode 100644 index 000000000000..6a054af813d5 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator-inl.h @@ -0,0 +1,444 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE 
+#define SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_INL_H +#include "simdjson/internal/dom_parser_implementation.h" +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/parser.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/logger-inl.h" +#include "simdjson/generic/ondemand/parser-inl.h" +#include "simdjson/generic/ondemand/token_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline json_iterator::json_iterator(json_iterator &&other) noexcept + : token(std::forward(other.token)), + parser{other.parser}, + _string_buf_loc{other._string_buf_loc}, + error{other.error}, + _depth{other._depth}, + _root{other._root}, + _streaming{other._streaming} +{ + other.parser = nullptr; +} +simdjson_inline json_iterator &json_iterator::operator=(json_iterator &&other) noexcept { + token = other.token; + parser = other.parser; + _string_buf_loc = other._string_buf_loc; + error = other.error; + _depth = other._depth; + _root = other._root; + _streaming = other._streaming; + other.parser = nullptr; + return *this; +} + +simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser) noexcept + : token(buf, &_parser->implementation->structural_indexes[0]), + parser{_parser}, + _string_buf_loc{parser->string_buf.get()}, + _depth{1}, + _root{parser->implementation->structural_indexes.get()}, + _streaming{false} + +{ + logger::log_headers(); +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif +} + +#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON +simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser, bool streaming) noexcept + : token(buf, &_parser->implementation->structural_indexes[0]), + parser{_parser}, + _string_buf_loc{parser->string_buf.get()}, + _depth{1}, + 
_root{parser->implementation->structural_indexes.get()}, + _streaming{streaming} + +{ + logger::log_headers(); +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif +} +#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + +inline void json_iterator::rewind() noexcept { + token.set_position( root_position() ); + logger::log_headers(); // We start again + _string_buf_loc = parser->string_buf.get(); + _depth = 1; +} + +inline bool json_iterator::balanced() const noexcept { + token_iterator ti(token); + int32_t count{0}; + ti.set_position( root_position() ); + while(ti.peek() <= peek_last()) { + switch (*ti.return_current_and_advance()) + { + case '[': case '{': + count++; + break; + case ']': case '}': + count--; + break; + default: + break; + } + } + return count == 0; +} + + +// GCC 7 warns when the first line of this function is inlined away into oblivion due to the caller +// relating depth and parent_depth, which is a desired effect. The warning does not show up if the +// skip_child() function is not marked inline). +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_warn_unused simdjson_inline error_code json_iterator::skip_child(depth_t parent_depth) noexcept { + if (depth() <= parent_depth) { return SUCCESS; } + switch (*return_current_and_advance()) { + // TODO consider whether matching braces is a requirement: if non-matching braces indicates + // *missing* braces, then future lookups are not in the object/arrays they think they are, + // violating the rule "validate enough structure that the user can be confident they are + // looking at the right values." 
+ // PERF TODO we can eliminate the switch here with a lookup of how much to add to depth + + // For the first open array/object in a value, we've already incremented depth, so keep it the same + // We never stop at colon, but if we did, it wouldn't affect depth + case '[': case '{': case ':': + logger::log_start_value(*this, "skip"); + break; + // If there is a comma, we have just finished a value in an array/object, and need to get back in + case ',': + logger::log_value(*this, "skip"); + break; + // ] or } means we just finished a value and need to jump out of the array/object + case ']': case '}': + logger::log_end_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } +#if SIMDJSON_CHECK_EOF + // If there are no more tokens, the parent is incomplete. + if (at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "Missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF + break; + case '"': + if(*peek() == ':') { + // We are at a key!!! + // This might happen if you just started an object and you skip it immediately. + // Performance note: it would be nice to get rid of this check as it is somewhat + // expensive. + // https://github.com/simdjson/simdjson/issues/1742 + logger::log_value(*this, "key"); + return_current_and_advance(); // eat up the ':' + break; // important!!! + } + simdjson_fallthrough; + // Anything else must be a scalar value + default: + // For the first scalar, we will have incremented depth already, so we decrement it here. 
+ logger::log_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } + break; + } + + // Now that we've considered the first value, we only increment/decrement for arrays/objects + while (position() < end_position()) { + switch (*return_current_and_advance()) { + case '[': case '{': + logger::log_start_value(*this, "skip"); + _depth++; + break; + // TODO consider whether matching braces is a requirement: if non-matching braces indicates + // *missing* braces, then future lookups are not in the object/arrays they think they are, + // violating the rule "validate enough structure that the user can be confident they are + // looking at the right values." + // PERF TODO we can eliminate the switch here with a lookup of how much to add to depth + case ']': case '}': + logger::log_end_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } + break; + default: + logger::log_value(*this, "skip", ""); + break; + } + } + + return report_error(TAPE_ERROR, "not enough close braces"); +} + +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline bool json_iterator::at_root() const noexcept { + return position() == root_position(); +} + +simdjson_inline bool json_iterator::is_single_token() const noexcept { + return parser->implementation->n_structural_indexes == 1; +} + +simdjson_inline bool json_iterator::streaming() const noexcept { + return _streaming; +} + +simdjson_inline token_position json_iterator::root_position() const noexcept { + return _root; +} + +simdjson_inline void json_iterator::assert_at_document_depth() const noexcept { + SIMDJSON_ASSUME( _depth == 1 ); +} + +simdjson_inline void json_iterator::assert_at_root() const noexcept { + SIMDJSON_ASSUME( _depth == 1 ); +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + // Under Visual Studio, the next SIMDJSON_ASSUME fails with: the argument + // has side effects that will be discarded. 
+ SIMDJSON_ASSUME( token.position() == _root ); +#endif +} + +simdjson_inline void json_iterator::assert_more_tokens(uint32_t required_tokens) const noexcept { + assert_valid_position(token._position + required_tokens - 1); +} + +simdjson_inline void json_iterator::assert_valid_position(token_position position) const noexcept { +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + SIMDJSON_ASSUME( position >= &parser->implementation->structural_indexes[0] ); + SIMDJSON_ASSUME( position < &parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] ); +#endif +} + +simdjson_inline bool json_iterator::at_end() const noexcept { + return position() == end_position(); +} +simdjson_inline token_position json_iterator::end_position() const noexcept { + uint32_t n_structural_indexes{parser->implementation->n_structural_indexes}; + return &parser->implementation->structural_indexes[n_structural_indexes]; +} + +inline std::string json_iterator::to_string() const noexcept { + if( !is_alive() ) { return "dead json_iterator instance"; } + const char * current_structural = reinterpret_cast(token.peek()); + return std::string("json_iterator [ depth : ") + std::to_string(_depth) + + std::string(", structural : '") + std::string(current_structural,1) + + std::string("', offset : ") + std::to_string(token.current_offset()) + + std::string("', error : ") + error_message(error) + + std::string(" ]"); +} + +inline simdjson_result json_iterator::current_location() const noexcept { + if (!is_alive()) { // Unrecoverable error + if (!at_root()) { + return reinterpret_cast(token.peek(-1)); + } else { + return reinterpret_cast(token.peek()); + } + } + if (at_end()) { + return OUT_OF_BOUNDS; + } + return reinterpret_cast(token.peek()); +} + +simdjson_inline bool json_iterator::is_alive() const noexcept { + return parser; +} + +simdjson_inline void json_iterator::abandon() noexcept { + parser = nullptr; + _depth = 0; +} + +simdjson_inline const uint8_t 
*json_iterator::return_current_and_advance() noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif // SIMDJSON_CHECK_EOF + return token.return_current_and_advance(); +} + +simdjson_inline const uint8_t *json_iterator::unsafe_pointer() const noexcept { + // deliberately done without safety guard: + return token.peek(); +} + +simdjson_inline const uint8_t *json_iterator::peek(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // SIMDJSON_CHECK_EOF + return token.peek(delta); +} + +simdjson_inline uint32_t json_iterator::peek_length(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // #if SIMDJSON_CHECK_EOF + return token.peek_length(delta); +} + +simdjson_inline const uint8_t *json_iterator::peek(token_position position) const noexcept { + // todo: currently we require end-of-string buffering, but the following + // assert_valid_position should be turned on if/when we lift that condition. + // assert_valid_position(position); + // This is almost surely related to SIMDJSON_CHECK_EOF but given that SIMDJSON_CHECK_EOF + // is ON by default, we have no choice but to disable it for real with a comment. + return token.peek(position); +} + +simdjson_inline uint32_t json_iterator::peek_length(token_position position) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_valid_position(position); +#endif // SIMDJSON_CHECK_EOF + return token.peek_length(position); +} +simdjson_inline uint32_t json_iterator::peek_root_length(token_position position) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_valid_position(position); +#endif // SIMDJSON_CHECK_EOF + return token.peek_root_length(position); +} + +simdjson_inline token_position json_iterator::last_position() const noexcept { + // The following line fails under some compilers... + // SIMDJSON_ASSUME(parser->implementation->n_structural_indexes > 0); + // since it has side-effects. 
+ uint32_t n_structural_indexes{parser->implementation->n_structural_indexes}; + SIMDJSON_ASSUME(n_structural_indexes > 0); + return &parser->implementation->structural_indexes[n_structural_indexes - 1]; +} +simdjson_inline const uint8_t *json_iterator::peek_last() const noexcept { + return token.peek(last_position()); +} + +simdjson_inline void json_iterator::ascend_to(depth_t parent_depth) noexcept { + SIMDJSON_ASSUME(parent_depth >= 0 && parent_depth < INT32_MAX - 1); + SIMDJSON_ASSUME(_depth == parent_depth + 1); + _depth = parent_depth; +} + +simdjson_inline void json_iterator::descend_to(depth_t child_depth) noexcept { + SIMDJSON_ASSUME(child_depth >= 1 && child_depth < INT32_MAX); + SIMDJSON_ASSUME(_depth == child_depth - 1); + _depth = child_depth; +} + +simdjson_inline depth_t json_iterator::depth() const noexcept { + return _depth; +} + +simdjson_inline uint8_t *&json_iterator::string_buf_loc() noexcept { + return _string_buf_loc; +} + +simdjson_inline error_code json_iterator::report_error(error_code _error, const char *message) noexcept { + SIMDJSON_ASSUME(_error != SUCCESS && _error != UNINITIALIZED && _error != INCORRECT_TYPE && _error != NO_SUCH_FIELD); + logger::log_error(*this, message); + error = _error; + return error; +} + +simdjson_inline token_position json_iterator::position() const noexcept { + return token.position(); +} + +simdjson_inline simdjson_result json_iterator::unescape(raw_json_string in, bool allow_replacement) noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + auto result = parser->unescape(in, _string_buf_loc, allow_replacement); + SIMDJSON_ASSUME(!parser->string_buffer_overflow(_string_buf_loc)); + return result; +#else + return parser->unescape(in, _string_buf_loc, allow_replacement); +#endif +} + +simdjson_inline simdjson_result json_iterator::unescape_wobbly(raw_json_string in) noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + auto result = parser->unescape_wobbly(in, _string_buf_loc); + 
SIMDJSON_ASSUME(!parser->string_buffer_overflow(_string_buf_loc)); + return result; +#else + return parser->unescape_wobbly(in, _string_buf_loc); +#endif +} + +simdjson_inline void json_iterator::reenter_child(token_position position, depth_t child_depth) noexcept { + SIMDJSON_ASSUME(child_depth >= 1 && child_depth < INT32_MAX); + SIMDJSON_ASSUME(_depth == child_depth - 1); +#if SIMDJSON_DEVELOPMENT_CHECKS +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + SIMDJSON_ASSUME(size_t(child_depth) < parser->max_depth()); + SIMDJSON_ASSUME(position >= parser->start_positions[child_depth]); +#endif +#endif + token.set_position(position); + _depth = child_depth; +} + +simdjson_inline error_code json_iterator::consume_character(char c) noexcept { + if (*peek() == c) { + return_current_and_advance(); + return SUCCESS; + } + return TAPE_ERROR; +} + +#if SIMDJSON_DEVELOPMENT_CHECKS + +simdjson_inline token_position json_iterator::start_position(depth_t depth) const noexcept { + SIMDJSON_ASSUME(size_t(depth) < parser->max_depth()); + return size_t(depth) < parser->max_depth() ? parser->start_positions[depth] : 0; +} + +simdjson_inline void json_iterator::set_start_position(depth_t depth, token_position position) noexcept { + SIMDJSON_ASSUME(size_t(depth) < parser->max_depth()); + if(size_t(depth) < parser->max_depth()) { parser->start_positions[depth] = position; } +} + +#endif + + +simdjson_inline error_code json_iterator::optional_error(error_code _error, const char *message) noexcept { + SIMDJSON_ASSUME(_error == INCORRECT_TYPE || _error == NO_SUCH_FIELD); + logger::log_error(*this, message); + return _error; +} + + +simdjson_warn_unused simdjson_inline bool json_iterator::copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t *tmpbuf, size_t N) noexcept { + // This function is not expected to be called in performance-sensitive settings. + // Let us guard against silly cases: + if((N < max_len) || (N == 0)) { return false; } + // Copy to the buffer. 
+ std::memcpy(tmpbuf, json, max_len); + if(N > max_len) { // We pad whatever remains with ' '. + std::memset(tmpbuf + max_len, ' ', N - max_len); + } + return true; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator.h new file mode 100644 index 000000000000..ee9872cb6c62 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_iterator.h @@ -0,0 +1,338 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/token_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * Iterates through JSON tokens, keeping track of depth and string buffer. + * + * @private This is not intended for external use. + */ +class json_iterator { +protected: + token_iterator token{}; + ondemand::parser *parser{}; + /** + * Next free location in the string buffer. + * + * Used by raw_json_string::unescape() to have a place to unescape strings to. + */ + uint8_t *_string_buf_loc{}; + /** + * JSON error, if there is one. + * + * INCORRECT_TYPE and NO_SUCH_FIELD are *not* stored here, ever. 
+ * + * PERF NOTE: we *hope* this will be elided into control flow, as it is only used (a) in the first + * iteration of the loop, or (b) for the final iteration after a missing comma is found in ++. If + * this is not elided, we should make sure it's at least not using up a register. Failing that, + * we should store it in document so there's only one of them. + */ + error_code error{SUCCESS}; + /** + * Depth of the current token in the JSON. + * + * - 0 = finished with document + * - 1 = document root value (could be [ or {, not yet known) + * - 2 = , or } inside root array/object + * - 3 = key or value inside root array/object. + */ + depth_t _depth{}; + /** + * Beginning of the document indexes. + * Normally we have root == parser->implementation->structural_indexes.get() + * but this may differ, especially in streaming mode (where we have several + * documents); + */ + token_position _root{}; + /** + * Normally, a json_iterator operates over a single document, but in + * some cases, we may have a stream of documents. This attribute is meant + * as meta-data: the json_iterator works the same irrespective of the + * value of this attribute. + */ + bool _streaming{false}; + +public: + simdjson_inline json_iterator() noexcept = default; + simdjson_inline json_iterator(json_iterator &&other) noexcept; + simdjson_inline json_iterator &operator=(json_iterator &&other) noexcept; + simdjson_inline explicit json_iterator(const json_iterator &other) noexcept = default; + simdjson_inline json_iterator &operator=(const json_iterator &other) noexcept = default; + /** + * Skips a JSON value, whether it is a scalar, array or object. + */ + simdjson_warn_unused simdjson_inline error_code skip_child(depth_t parent_depth) noexcept; + + /** + * Tell whether the iterator is still at the start + */ + simdjson_inline bool at_root() const noexcept; + + /** + * Tell whether we should be expected to run in streaming + * mode (iterating over many documents). 
It is pure metadata + * that does not affect how the iterator works. It is used by + * start_root_array() and start_root_object(). + */ + simdjson_inline bool streaming() const noexcept; + + /** + * Get the root value iterator + */ + simdjson_inline token_position root_position() const noexcept; + /** + * Assert that we are at the document depth (== 1) + */ + simdjson_inline void assert_at_document_depth() const noexcept; + /** + * Assert that we are at the root of the document + */ + simdjson_inline void assert_at_root() const noexcept; + + /** + * Tell whether the iterator is at the EOF mark + */ + simdjson_inline bool at_end() const noexcept; + + /** + * Tell whether the iterator is live (has not been moved). + */ + simdjson_inline bool is_alive() const noexcept; + + /** + * Abandon this iterator, setting depth to 0 (as if the document is finished). + */ + simdjson_inline void abandon() noexcept; + + /** + * Advance the current token without modifying depth. + */ + simdjson_inline const uint8_t *return_current_and_advance() noexcept; + + /** + * Returns true if there is a single token in the index (i.e., it is + * a JSON with a scalar value such as a single number). + * + * @return whether there is a single token + */ + simdjson_inline bool is_single_token() const noexcept; + + /** + * Assert that there are at least the given number of tokens left. + * + * Has no effect in release builds. + */ + simdjson_inline void assert_more_tokens(uint32_t required_tokens=1) const noexcept; + /** + * Assert that the given position addresses an actual token (is within bounds). + * + * Has no effect in release builds. + */ + simdjson_inline void assert_valid_position(token_position position) const noexcept; + /** + * Get the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = next token, -1 = prev token. 
+ * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it is not used ... + */ + simdjson_inline const uint8_t *peek(int32_t delta=0) const noexcept; + /** + * Get the maximum length of the JSON text for the current token (or relative). + * + * The length will include any whitespace at the end of the token. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_length(int32_t delta=0) const noexcept; + /** + * Get a pointer to the current location in the input buffer. + * + * This is not null-terminated; it is a view into the JSON. + * + * You may be pointing outside of the input buffer: it is not generally + * safe to dereference this pointer. + */ + simdjson_inline const uint8_t *unsafe_pointer() const noexcept; + /** + * Get the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token to retrieve. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it is not used ... + */ + simdjson_inline const uint8_t *peek(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for the current token (or relative). + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token to retrieve. + */ + simdjson_inline uint32_t peek_length(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for the current root token. + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token to retrieve. + */ + simdjson_inline uint32_t peek_root_length(token_position position) const noexcept; + /** + * Get the JSON text for the last token in the document. + * + * This is not null-terminated; it is a view into the JSON. 
+ * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it is not used ... + */ + simdjson_inline const uint8_t *peek_last() const noexcept; + + /** + * Ascend one level. + * + * Validates that the depth - 1 == parent_depth. + * + * @param parent_depth the expected parent depth. + */ + simdjson_inline void ascend_to(depth_t parent_depth) noexcept; + + /** + * Descend one level. + * + * Validates that the new depth == child_depth. + * + * @param child_depth the expected child depth. + */ + simdjson_inline void descend_to(depth_t child_depth) noexcept; + simdjson_inline void descend_to(depth_t child_depth, int32_t delta) noexcept; + + /** + * Get current depth. + */ + simdjson_inline depth_t depth() const noexcept; + + /** + * Get current (writeable) location in the string buffer. + */ + simdjson_inline uint8_t *&string_buf_loc() noexcept; + + /** + * Report an unrecoverable error, preventing further iteration. + * + * @param error The error to report. Must not be SUCCESS, UNINITIALIZED, INCORRECT_TYPE, or NO_SUCH_FIELD. + * @param message An error message to report with the error. + */ + simdjson_inline error_code report_error(error_code error, const char *message) noexcept; + + /** + * Log error, but don't stop iteration. + * @param error The error to report. Must be INCORRECT_TYPE, or NO_SUCH_FIELD. + * @param message An error message to report with the error. + */ + simdjson_inline error_code optional_error(error_code error, const char *message) noexcept; + + /** + * Take an input in json containing max_len characters and attempt to copy it over to tmpbuf, a buffer with + * N bytes of capacity. It will return false if N is too small (smaller than max_len) of if it is zero. + * The buffer (tmpbuf) is padded with space characters. 
+ */ + simdjson_warn_unused simdjson_inline bool copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t *tmpbuf, size_t N) noexcept; + + simdjson_inline token_position position() const noexcept; + /** + * Write the raw_json_string to the string buffer and return a string_view. + * Each raw_json_string should be unescaped once, or else the string buffer might + * overflow. + */ + simdjson_inline simdjson_result unescape(raw_json_string in, bool allow_replacement) noexcept; + simdjson_inline simdjson_result unescape_wobbly(raw_json_string in) noexcept; + + simdjson_inline void reenter_child(token_position position, depth_t child_depth) noexcept; + + simdjson_inline error_code consume_character(char c) noexcept; +#if SIMDJSON_DEVELOPMENT_CHECKS + simdjson_inline token_position start_position(depth_t depth) const noexcept; + simdjson_inline void set_start_position(depth_t depth, token_position position) noexcept; +#endif + + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; + + /** + * Returns the current location in the document if in bounds. + */ + inline simdjson_result current_location() const noexcept; + + /** + * Updates this json iterator so that it is back at the beginning of the document, + * as if it had just been created. + */ + inline void rewind() noexcept; + /** + * This checks whether the {,},[,] are balanced so that the document + * ends with proper zero depth. This requires scanning the whole document + * and it may be expensive. It is expected that it will be rarely called. + * It does not attempt to match { with } and [ with ]. 
+ */ + inline bool balanced() const noexcept; +protected: + simdjson_inline json_iterator(const uint8_t *buf, ondemand::parser *parser) noexcept; +#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + simdjson_inline json_iterator(const uint8_t *buf, ondemand::parser *parser, bool streaming) noexcept; +#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + /// The last token before the end + simdjson_inline token_position last_position() const noexcept; + /// The token *at* the end. This points at gibberish and should only be used for comparison. + simdjson_inline token_position end_position() const noexcept; + /// The end of the buffer. + simdjson_inline token_position end() const noexcept; + + friend class document; + friend class document_stream; + friend class object; + friend class array; + friend class value; + friend class raw_json_string; + friend class parser; + friend class value_iterator; + friend class field; + template + friend simdjson_inline void logger::log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta, logger::log_level level, Args&&... args) noexcept; + template + friend simdjson_inline void logger::log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail, logger::log_level level, Args&&... 
args) noexcept; +}; // json_iterator + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_JSON_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type-inl.h new file mode 100644 index 000000000000..7fa5eeb30f27 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type-inl.h @@ -0,0 +1,117 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/json_type.h" +#include "simdjson/generic/implementation_simdjson_result_base-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +inline std::ostream& operator<<(std::ostream& out, json_type type) noexcept { + switch (type) { + case json_type::array: out << "array"; break; + case json_type::object: out << "object"; break; + case json_type::number: out << "number"; break; + case json_type::string: out << "string"; break; + case json_type::boolean: out << "boolean"; break; + case json_type::null: out << "null"; break; + default: SIMDJSON_UNREACHABLE(); + } + return out; +} + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson_result &type) noexcept(false) { + return out << type.value(); +} 
+#endif + + + +simdjson_inline number_type number::get_number_type() const noexcept { + return type; +} + +simdjson_inline bool number::is_uint64() const noexcept { + return get_number_type() == number_type::unsigned_integer; +} + +simdjson_inline uint64_t number::get_uint64() const noexcept { + return payload.unsigned_integer; +} + +simdjson_inline number::operator uint64_t() const noexcept { + return get_uint64(); +} + +simdjson_inline bool number::is_int64() const noexcept { + return get_number_type() == number_type::signed_integer; +} + +simdjson_inline int64_t number::get_int64() const noexcept { + return payload.signed_integer; +} + +simdjson_inline number::operator int64_t() const noexcept { + return get_int64(); +} + +simdjson_inline bool number::is_double() const noexcept { + return get_number_type() == number_type::floating_point_number; +} + +simdjson_inline double number::get_double() const noexcept { + return payload.floating_point_number; +} + +simdjson_inline number::operator double() const noexcept { + return get_double(); +} + +simdjson_inline double number::as_double() const noexcept { + if(is_double()) { + return payload.floating_point_number; + } + if(is_int64()) { + return double(payload.signed_integer); + } + return double(payload.unsigned_integer); +} + +simdjson_inline void number::append_s64(int64_t value) noexcept { + payload.signed_integer = value; + type = number_type::signed_integer; +} + +simdjson_inline void number::append_u64(uint64_t value) noexcept { + payload.unsigned_integer = value; + type = number_type::unsigned_integer; +} + +simdjson_inline void number::append_double(double value) noexcept { + payload.floating_point_number = value; + type = number_type::floating_point_number; +} + +simdjson_inline void number::skip_double() noexcept { + type = number_type::floating_point_number; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline 
simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::json_type &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type.h new file mode 100644 index 000000000000..b5a970433e8c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/json_type.h @@ -0,0 +1,160 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/numberparsing.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * The type of a JSON value. + */ +enum class json_type { + // Start at 1 to catch uninitialized / default values more easily + array=1, ///< A JSON array ( [ 1, 2, 3 ... ] ) + object, ///< A JSON object ( { "a": 1, "b" 2, ... } ) + number, ///< A JSON number ( 1 or -2.3 or 4.5e6 ...) + string, ///< A JSON string ( "a" or "hello world\n" ...) + boolean, ///< A JSON boolean (true or false) + null ///< A JSON null (null) +}; + +/** + * A type representing a JSON number. + * The design of the struct is deliberately straight-forward. All + * functions return standard values with no error check. + */ +struct number { + + /** + * return the automatically determined type of + * the number: number_type::floating_point_number, + * number_type::signed_integer or number_type::unsigned_integer. 
+ * + * enum class number_type { + * floating_point_number=1, /// a binary64 number + * signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + * unsigned_integer /// a positive integer larger or equal to 1<<63 + * }; + */ + simdjson_inline ondemand::number_type get_number_type() const noexcept; + /** + * return true if the automatically determined type of + * the number is number_type::unsigned_integer. + */ + simdjson_inline bool is_uint64() const noexcept; + /** + * return the value as a uint64_t, only valid if is_uint64() is true. + */ + simdjson_inline uint64_t get_uint64() const noexcept; + simdjson_inline operator uint64_t() const noexcept; + + /** + * return true if the automatically determined type of + * the number is number_type::signed_integer. + */ + simdjson_inline bool is_int64() const noexcept; + /** + * return the value as a int64_t, only valid if is_int64() is true. + */ + simdjson_inline int64_t get_int64() const noexcept; + simdjson_inline operator int64_t() const noexcept; + + + /** + * return true if the automatically determined type of + * the number is number_type::floating_point_number. + */ + simdjson_inline bool is_double() const noexcept; + /** + * return the value as a double, only valid if is_double() is true. + */ + simdjson_inline double get_double() const noexcept; + simdjson_inline operator double() const noexcept; + + /** + * Convert the number to a double. Though it always succeed, the conversion + * may be lossy if the number cannot be represented exactly. + */ + simdjson_inline double as_double() const noexcept; + + +protected: + /** + * The next block of declaration is designed so that we can call the number parsing + * functions on a number type. They are protected and should never be used outside + * of the core simdjson library. 
+ */ + friend class value_iterator; + template + friend error_code numberparsing::write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer); + template + friend error_code numberparsing::parse_number(const uint8_t *const src, W &writer); + /** Store a signed 64-bit value to the number. */ + simdjson_inline void append_s64(int64_t value) noexcept; + /** Store an unsigned 64-bit value to the number. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + /** Store a double value to the number. */ + simdjson_inline void append_double(double value) noexcept; + /** Specifies that the value is a double, but leave it undefined. */ + simdjson_inline void skip_double() noexcept; + /** + * End of friend declarations. + */ + + /** + * Our attributes are a union type (size = 64 bits) + * followed by a type indicator. + */ + union { + double floating_point_number; + int64_t signed_integer; + uint64_t unsigned_integer; + } payload{0}; + number_type type{number_type::signed_integer}; +}; + +/** + * Write the JSON type to the output stream + * + * @param out The output stream. + * @param type The json_type. + */ +inline std::ostream& operator<<(std::ostream& out, json_type type) noexcept; + +#if SIMDJSON_EXCEPTIONS +/** + * Send JSON type to an output stream. + * + * @param out The output stream. + * @param type The json_type. + * @throw simdjson_error if the result being printed has an error. If there is an error with the + * underlying output stream, that error will be propagated (simdjson_error will not be + * thrown). 
+ */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &type) noexcept(false); +#endif + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::json_type &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_JSON_TYPE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger-inl.h new file mode 100644 index 000000000000..268fb2d1be12 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger-inl.h @@ -0,0 +1,225 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_LOGGER_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_LOGGER_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/logger.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { +namespace logger { + +static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; +static constexpr const int LOG_EVENT_LEN = 20; +static constexpr const int LOG_BUFFER_LEN = 30; +static constexpr 
const int LOG_SMALL_BUFFER_LEN = 10; +static int log_depth = 0; // Not threadsafe. Log only. + +// Helper to turn unprintable or newline characters into spaces +static inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } +} + +template +static inline std::string string_format(const std::string& format, const Args&... args) +{ + SIMDJSON_PUSH_DISABLE_ALL_WARNINGS + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + 1; + auto size = static_cast(size_s); + if (size <= 0) return std::string(); + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args...); + SIMDJSON_POP_DISABLE_WARNINGS + return std::string(buf.get(), buf.get() + size - 1); +} + +static inline log_level get_log_level_from_env() +{ + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *lvl = getenv("SIMDJSON_LOG_LEVEL"); + SIMDJSON_POP_DISABLE_WARNINGS + if (lvl && simdjson_strcasecmp(lvl, "ERROR") == 0) { return log_level::error; } + return log_level::info; +} + +static inline log_level log_threshold() +{ + static log_level threshold = get_log_level_from_env(); + return threshold; +} + +static inline bool should_log(log_level level) +{ + return level >= log_threshold(); +} + +inline void log_event(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, "", type, detail, delta, depth_delta, log_level::info); +} + +inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { + log_line(iter, index, depth, "", type, detail, log_level::info); +} +inline void log_value(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, "", type, detail, delta, depth_delta, log_level::info); +} + +inline void 
log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { + log_line(iter, index, depth, "+", type, detail, log_level::info); + if (LOG_ENABLED) { log_depth++; } +} +inline void log_start_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_line(iter, "+", type, "", delta, depth_delta, log_level::info); + if (LOG_ENABLED) { log_depth++; } +} + +inline void log_end_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + if (LOG_ENABLED) { log_depth--; } + log_line(iter, "-", type, "", delta, depth_delta, log_level::info); +} + +inline void log_error(const json_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { + log_line(iter, "ERROR: ", error, detail, delta, depth_delta, log_level::error); +} +inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail) noexcept { + log_line(iter, index, depth, "ERROR: ", error, detail, log_level::error); +} + +inline void log_event(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_event(iter.json_iter(), type, detail, delta, depth_delta); +} + +inline void log_value(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_value(iter.json_iter(), type, detail, delta, depth_delta); +} + +inline void log_start_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_start_value(iter.json_iter(), type, delta, depth_delta); +} + +inline void log_end_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_end_value(iter.json_iter(), type, delta, depth_delta); +} + +inline void log_error(const value_iterator &iter, const char *error, const char *detail, int delta, int 
depth_delta) noexcept { + log_error(iter.json_iter(), error, detail, delta, depth_delta); +} + +inline void log_headers() noexcept { + if (LOG_ENABLED) { + if (simdjson_unlikely(should_log(log_level::info))) { + // Technically a static variable is not thread-safe, but if you are using threads and logging... well... + static bool displayed_hint{false}; + log_depth = 0; + printf("\n"); + if (!displayed_hint) { + // We only print this helpful header once. + printf("# Logging provides the depth and position of the iterator user-visible steps:\n"); + printf("# +array says 'this is where we were when we discovered the start array'\n"); + printf( + "# -array says 'this is where we were when we ended the array'\n"); + printf("# skip says 'this is a structural or value I am skipping'\n"); + printf("# +/-skip says 'this is a start/end array or object I am skipping'\n"); + printf("#\n"); + printf("# The indentation of the terms (array, string,...) indicates the depth,\n"); + printf("# in addition to the depth being displayed.\n"); + printf("#\n"); + printf("# Every token in the document has a single depth determined by the tokens before it,\n"); + printf("# and is not affected by what the token actually is.\n"); + printf("#\n"); + printf("# Not all structural elements are presented as tokens in the logs.\n"); + printf("#\n"); + printf("# We never give control to the user within an empty array or an empty object.\n"); + printf("#\n"); + printf("# Inside an array, having a depth greater than the array's depth means that\n"); + printf("# we are pointing inside a value.\n"); + printf("# Having a depth equal to the array means that we are pointing right before a value.\n"); + printf("# Having a depth smaller than the array means that we have moved beyond the array.\n"); + displayed_hint = true; + } + printf("\n"); + printf("| %-*s ", LOG_EVENT_LEN, "Event"); + printf("| %-*s ", LOG_BUFFER_LEN, "Buffer"); + printf("| %-*s ", LOG_SMALL_BUFFER_LEN, "Next"); + // printf("| %-*s ", 5, 
"Next#"); + printf("| %-*s ", 5, "Depth"); + printf("| Detail "); + printf("|\n"); + + printf("|%.*s", LOG_EVENT_LEN + 2, DASHES); + printf("|%.*s", LOG_BUFFER_LEN + 2, DASHES); + printf("|%.*s", LOG_SMALL_BUFFER_LEN + 2, DASHES); + // printf("|%.*s", 5+2, DASHES); + printf("|%.*s", 5 + 2, DASHES); + printf("|--------"); + printf("|\n"); + fflush(stdout); + } + } +} + +template +inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta, log_level level, Args&&... args) noexcept { + log_line(iter, iter.position()+delta, depth_t(iter.depth()+depth_delta), title_prefix, title, detail, level, std::forward(args)...); +} + +template +inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail, log_level level, Args&&... args) noexcept { + if (LOG_ENABLED) { + if (simdjson_unlikely(should_log(level))) { + const int indent = depth * 2; + const auto buf = iter.token.buf; + auto msg = string_format(title, std::forward(args)...); + printf("| %*s%s%-*s ", indent, "", title_prefix, + LOG_EVENT_LEN - indent - int(strlen(title_prefix)), msg.c_str()); + { + // Print the current structural. + printf("| "); + // Before we begin, the index might point right before the document. + // This could be unsafe, see https://github.com/simdjson/simdjson/discussions/1938 + if (index < iter._root) { + printf("%*s", LOG_BUFFER_LEN, ""); + } else { + auto current_structural = &buf[*index]; + for (int i = 0; i < LOG_BUFFER_LEN; i++) { + printf("%c", printable_char(current_structural[i])); + } + } + printf(" "); + } + { + // Print the next structural. 
+ printf("| "); + auto next_structural = &buf[*(index + 1)]; + for (int i = 0; i < LOG_SMALL_BUFFER_LEN; i++) { + printf("%c", printable_char(next_structural[i])); + } + printf(" "); + } + // printf("| %5u ", *(index+1)); + printf("| %5i ", depth); + printf("| %6.*s ", int(detail.size()), detail.data()); + printf("|\n"); + fflush(stdout); + } + } +} + +} // namespace logger +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_LOGGER_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger.h new file mode 100644 index 000000000000..7696600f98c2 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/logger.h @@ -0,0 +1,58 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_LOGGER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_LOGGER_H +#include "simdjson/generic/ondemand/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +// Logging should be free unless SIMDJSON_VERBOSE_LOGGING is set. Importantly, it is critical +// that the call to the log functions be side-effect free. Thus, for example, you should not +// create temporary std::string instances. +namespace logger { + +enum class log_level : int32_t { + info = 0, + error = 1 +}; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + +// We do not want these functions to be 'really inlined' since real inlining is +// for performance purposes and if you are using the loggers, you do not care about +// performance (or should not). 
+static inline void log_headers() noexcept; +// If args are provided, title will be treated as format string +template +static inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail, logger::log_level level, Args&&... args) noexcept; +template +static inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta, logger::log_level level, Args&&... args) noexcept; +static inline void log_event(const json_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_value(const json_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_start_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; + +static inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail="") noexcept; +static inline void log_error(const json_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; + +static inline void log_event(const value_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const value_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void 
log_start_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_error(const value_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; + +} // namespace logger +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_LOGGER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/object-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object-inl.h new file mode 100644 index 000000000000..624d66838306 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object-inl.h @@ -0,0 +1,276 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_OBJECT_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_OBJECT_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/field.h" +#include "simdjson/generic/ondemand/object.h" +#include "simdjson/generic/ondemand/object_iterator.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/value-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline simdjson_result object::find_field_unordered(const std::string_view key) & noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_unordered_raw(key).get(has_value) ); + if (!has_value) { + logger::log_line(iter.json_iter(), "ERROR: ", "Cannot find key %.*s", "", -1, 0, logger::log_level::error, static_cast(key.size()), key.data()); + return NO_SUCH_FIELD; + } + return value(iter.child()); +} +simdjson_inline simdjson_result object::find_field_unordered(const std::string_view key) 
&& noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_unordered_raw(key).get(has_value) ); + if (!has_value) { + logger::log_line(iter.json_iter(), "ERROR: ", "Cannot find key %.*s", "", -1, 0, logger::log_level::error, static_cast(key.size()), key.data()); + return NO_SUCH_FIELD; + } + return value(iter.child()); +} +simdjson_inline simdjson_result object::operator[](const std::string_view key) & noexcept { + return find_field_unordered(key); +} +simdjson_inline simdjson_result object::operator[](const std::string_view key) && noexcept { + return std::forward(*this).find_field_unordered(key); +} +simdjson_inline simdjson_result object::find_field(const std::string_view key) & noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_raw(key).get(has_value) ); + if (!has_value) { + logger::log_line(iter.json_iter(), "ERROR: ", "Cannot find key %.*s", "", -1, 0, logger::log_level::error, static_cast(key.size()), key.data()); + return NO_SUCH_FIELD; + } + return value(iter.child()); +} +simdjson_inline simdjson_result object::find_field(const std::string_view key) && noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_raw(key).get(has_value) ); + if (!has_value) { + logger::log_line(iter.json_iter(), "ERROR: ", "Cannot find key %.*s", "", -1, 0, logger::log_level::error, static_cast(key.size()), key.data()); + return NO_SUCH_FIELD; + } + return value(iter.child()); +} + +simdjson_inline simdjson_result object::start(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.start_object().error() ); + return object(iter); +} +simdjson_inline simdjson_result object::start_root(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.start_root_object().error() ); + return object(iter); +} +simdjson_inline error_code object::consume() noexcept { + if(iter.is_at_key()) { + /** + * whenever you are pointing at a key, calling skip_child() is + * unsafe because you will hit a string and you will assume that + * it is string value, and this mistake will lead 
you to make bad + * depth computation. + */ + /** + * We want to 'consume' the key. We could really + * just do _json_iter->return_current_and_advance(); at this + * point, but, for clarity, we will use the high-level API to + * eat the key. We assume that the compiler optimizes away + * most of the work. + */ + simdjson_unused raw_json_string actual_key; + auto error = iter.field_key().get(actual_key); + if (error) { iter.abandon(); return error; }; + // Let us move to the value while we are at it. + if ((error = iter.field_value())) { iter.abandon(); return error; } + } + auto error_skip = iter.json_iter().skip_child(iter.depth()-1); + if(error_skip) { iter.abandon(); } + return error_skip; +} + +simdjson_inline simdjson_result object::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + const uint8_t * final_point{iter._json_iter->peek()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_inline simdjson_result object::started(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.started_object().error() ); + return object(iter); +} + +simdjson_inline object object::resume(const value_iterator &iter) noexcept { + return iter; +} + +simdjson_inline object::object(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} + +simdjson_inline simdjson_result object::begin() noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!iter.is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + return object_iterator(iter); +} +simdjson_inline simdjson_result object::end() noexcept { + return object_iterator(iter); +} + +inline simdjson_result object::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + size_t slash = json_pointer.find('/'); + std::string_view key = json_pointer.substr(0, slash); + // Grab the child 
with the given key + simdjson_result child; + + // If there is an escape character in the key, unescape it and then get the child. + size_t escape = key.find('~'); + if (escape != std::string_view::npos) { + // Unescape the key + std::string unescaped(key); + do { + switch (unescaped[escape+1]) { + case '0': + unescaped.replace(escape, 2, "~"); + break; + case '1': + unescaped.replace(escape, 2, "/"); + break; + default: + return INVALID_JSON_POINTER; // "Unexpected ~ escape character in JSON pointer"); + } + escape = unescaped.find('~', escape+1); + } while (escape != std::string::npos); + child = find_field(unescaped); // Take note find_field does not unescape keys when matching + } else { + child = find_field(key); + } + if(child.error()) { + return child; // we do not continue if there was an error + } + // If there is a /, we have to recurse and look up more of the path + if (slash != std::string_view::npos) { + child = child.at_pointer(json_pointer.substr(slash)); + } + return child; +} + +inline simdjson_result object::at_path(std::string_view json_path) noexcept { + auto json_pointer = json_path_to_pointer_conversion(json_path); + if (json_pointer == "-1") { + return INVALID_JSON_POINTER; + } + return at_pointer(json_pointer); +} + +simdjson_inline simdjson_result object::count_fields() & noexcept { + size_t count{0}; + // Important: we do not consume any of the values. + for(simdjson_unused auto v : *this) { count++; } + // The above loop will always succeed, but we want to report errors. + if(iter.error()) { return iter.error(); } + // We need to move back at the start because we expect users to iterate through + // the object after counting the number of elements. 
+ iter.reset_object(); + return count; +} + +simdjson_inline simdjson_result object::is_empty() & noexcept { + bool is_not_empty; + auto error = iter.reset_object().get(is_not_empty); + if(error) { return error; } + return !is_not_empty; +} + +simdjson_inline simdjson_result object::reset() & noexcept { + return iter.reset_object(); +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::object &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +simdjson_inline simdjson_result simdjson_result::begin() noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() noexcept { + if (error()) { return error(); } + return first.end(); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) && noexcept { + if (error()) { return error(); } + return std::forward(first).find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) && noexcept { + if (error()) { return error(); } + return std::forward(first)[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) && noexcept { + if 
(error()) { return error(); } + return std::forward(first).find_field(key); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + +simdjson_inline simdjson_result simdjson_result::at_path( + std::string_view json_path) noexcept { + if (error()) { + return error(); + } + return first.at_path(json_path); +} + +inline simdjson_result simdjson_result::reset() noexcept { + if (error()) { return error(); } + return first.reset(); +} + +inline simdjson_result simdjson_result::is_empty() noexcept { + if (error()) { return error(); } + return first.is_empty(); +} + +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json() noexcept { + if (error()) { return error(); } + return first.raw_json(); +} +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_OBJECT_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/object.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object.h new file mode 100644 index 000000000000..8e3ed9af39f5 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object.h @@ -0,0 +1,258 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_OBJECT_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_OBJECT_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A forward-only JSON object field iterator. + */ +class object { +public: + /** + * Create a new invalid object. + * + * Exists so you can declare a variable and later assign to it before use. 
+ */ + simdjson_inline object() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. The value instance you get + * from `content["bids"]` becomes invalid when you call `content["asks"]`. The array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to a + * key a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() + * is an error. + * + * If you expect to have keys with escape characters, please review our documentation. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. 
+ */ + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field(std::string_view key) && noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field was not there when they are not in order). + * + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. The value instance you get + * from `content["bids"]` becomes invalid when you call `content["asks"]`. The array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. 
+ * + * You are expected to access keys only once. You should access the value corresponding to a key + * a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() is an error. + * + * If you expect to have keys with escape characters, please review our documentation. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) && noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) && noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. 
Yet it is not the case when calling at_pointer on an object + * instance: there is no rewind and no invalidation. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that are trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + */ + inline simdjson_result at_path(std::string_view json_path) noexcept; + + /** + * Reset the iterator so that we are pointing back at the + * beginning of the object. You should still consume values only once even if you + * can iterate through the object more than once. If you unescape a string or a key + * within the object more than once, you have unsafe code. Note that rewinding an object + * means that you may need to reparse it anew: it is not a free operation. 
+ * + * @returns true if the object contains some elements (not empty) + */ + inline simdjson_result reset() & noexcept; + /** + * This method scans the beginning of the object and checks whether the + * object is empty. + * The runtime complexity is constant time. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + inline simdjson_result is_empty() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. + * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method. + * + * Performance hint: You should only call count_fields() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Consumes the object and returns a string_view instance corresponding to the + * object as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_inline simdjson_result raw_json() noexcept; + +protected: + /** + * Go to the end of the object, no matter where you are right now. 
+ */ + simdjson_inline error_code consume() noexcept; + static simdjson_inline simdjson_result start(value_iterator &iter) noexcept; + static simdjson_inline simdjson_result start_root(value_iterator &iter) noexcept; + static simdjson_inline simdjson_result started(value_iterator &iter) noexcept; + static simdjson_inline object resume(const value_iterator &iter) noexcept; + simdjson_inline object(const value_iterator &iter) noexcept; + + simdjson_warn_unused simdjson_inline error_code find_field_raw(const std::string_view key) noexcept; + + value_iterator iter{}; + + friend class value; + friend class document; + friend struct simdjson_result; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::object &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) && noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) && noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) && noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; + + inline simdjson_result reset() noexcept; + inline simdjson_result is_empty() noexcept; + inline 
simdjson_result count_fields() & noexcept; + inline simdjson_result raw_json() noexcept; + +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_OBJECT_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator-inl.h new file mode 100644 index 000000000000..36294ce7185e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator-inl.h @@ -0,0 +1,138 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/object_iterator.h" +#include "simdjson/generic/ondemand/field-inl.h" +#include "simdjson/generic/ondemand/value_iterator-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +// +// object_iterator +// + +simdjson_inline object_iterator::object_iterator(const value_iterator &_iter) noexcept + : iter{_iter} +{} + +simdjson_inline simdjson_result object_iterator::operator*() noexcept { + error_code error = iter.error(); + if (error) { iter.abandon(); return error; } + auto result = field::start(iter); + // TODO this is a safety rail ... users should exit loops as soon as they receive an error. + // Nonetheless, let's see if performance is OK with this if statement--the compiler may give it to us for free. 
+ if (result.error()) { iter.abandon(); } + return result; +} +simdjson_inline bool object_iterator::operator==(const object_iterator &other) const noexcept { + return !(*this != other); +} +simdjson_inline bool object_iterator::operator!=(const object_iterator &) const noexcept { + return iter.is_open(); +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline object_iterator &object_iterator::operator++() noexcept { + // TODO this is a safety rail ... users should exit loops as soon as they receive an error. + // Nonetheless, let's see if performance is OK with this if statement--the compiler may give it to us for free. + if (!iter.is_open()) { return *this; } // Iterator will be released if there is an error + + simdjson_unused error_code error; + if ((error = iter.skip_child() )) { return *this; } + + simdjson_unused bool has_value; + if ((error = iter.has_next_field().get(has_value) )) { return *this; }; + return *this; +} +SIMDJSON_POP_DISABLE_WARNINGS + +// +// ### Live States +// +// While iterating or looking up values, depth >= iter.depth. at_start may vary. Error is +// always SUCCESS: +// +// - Start: This is the state when the object is first found and the iterator is just past the {. +// In this state, at_start == true. +// - Next: After we hand a scalar value to the user, or an array/object which they then fully +// iterate over, the iterator is at the , or } before the next value. In this state, +// depth == iter.depth, at_start == false, and error == SUCCESS. +// - Unfinished Business: When we hand an array/object to the user which they do not fully +// iterate over, we need to finish that iteration by skipping child values until we reach the +// Next state. In this state, depth > iter.depth, at_start == false, and error == SUCCESS. +// +// ## Error States +// +// In error states, we will yield exactly one more value before stopping. iter.depth == depth +// and at_start is always false. 
We decrement after yielding the error, moving to the Finished +// state. +// +// - Chained Error: When the object iterator is part of an error chain--for example, in +// `for (auto tweet : doc["tweets"])`, where the tweet field may be missing or not be an +// object--we yield that error in the loop, exactly once. In this state, error != SUCCESS and +// iter.depth == depth, and at_start == false. We decrement depth when we yield the error. +// - Missing Comma Error: When the iterator ++ method discovers there is no comma between fields, +// we flag that as an error and treat it exactly the same as a Chained Error. In this state, +// error == TAPE_ERROR, iter.depth == depth, and at_start == false. +// +// Errors that occur while reading a field to give to the user (such as when the key is not a +// string or the field is missing a colon) are yielded immediately. Depth is then decremented, +// moving to the Finished state without transitioning through an Error state at all. +// +// ## Terminal State +// +// The terminal state has iter.depth < depth. at_start is always false. +// +// - Finished: When we have reached a }, we are finished. We signal this by decrementing depth. +// In this state, iter.depth < depth, at_start == false, and error == SUCCESS. +// + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::object_iterator &&value +) noexcept + : implementation_simdjson_result_base(std::forward(value)) +{ + first.iter.assert_is_valid(); +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base({}, error) +{ +} + +simdjson_inline simdjson_result simdjson_result::operator*() noexcept { + if (error()) { return error(); } + return *first; +} +// If we're iterating and there is an error, return the error once. 
+simdjson_inline bool simdjson_result::operator==(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return !error(); } + return first == other.first; +} +// If we're iterating and there is an error, return the error once. +simdjson_inline bool simdjson_result::operator!=(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return error(); } + return first != other.first; +} +// Checks for ']' and ',' +simdjson_inline simdjson_result &simdjson_result::operator++() noexcept { + // Clear the error if there is one, so we don't yield it twice + if (error()) { second = SUCCESS; return *this; } + ++first; + return *this; +} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator.h new file mode 100644 index 000000000000..1cdc3f43b359 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/object_iterator.h @@ -0,0 +1,80 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +class object_iterator { +public: + /** + * Create a new invalid object_iterator. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline object_iterator() noexcept = default; + + // + // Iterator interface + // + + // Reads key and value, yielding them to the user. + // MUST ONLY BE CALLED ONCE PER ITERATION. 
+ simdjson_inline simdjson_result operator*() noexcept; + // Assumes it's being compared with the end. true if depth < iter->depth. + simdjson_inline bool operator==(const object_iterator &) const noexcept; + // Assumes it's being compared with the end. true if depth >= iter->depth. + simdjson_inline bool operator!=(const object_iterator &) const noexcept; + // Checks for ']' and ',' + simdjson_inline object_iterator &operator++() noexcept; + +private: + /** + * The underlying JSON iterator. + * + * PERF NOTE: expected to be elided in favor of the parent document: this is set when the object + * is first used, and never changes afterwards. + */ + value_iterator iter{}; + + simdjson_inline object_iterator(const value_iterator &iter) noexcept; + friend struct simdjson_result; + friend class object; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::object_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + // + // Iterator interface + // + + // Reads key and value, yielding them to the user. + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + // Assumes it's being compared with the end. true if depth < iter->depth. + simdjson_inline bool operator==(const simdjson_result &) const noexcept; + // Assumes it's being compared with the end. true if depth >= iter->depth. 
+ simdjson_inline bool operator!=(const simdjson_result &) const noexcept; + // Checks for ']' and ',' + simdjson_inline simdjson_result &operator++() noexcept; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_OBJECT_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser-inl.h new file mode 100644 index 000000000000..8350fe73831b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser-inl.h @@ -0,0 +1,205 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_PARSER_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_PARSER_INL_H +#include "simdjson/padded_string.h" +#include "simdjson/padded_string_view.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/dom_parser_implementation.h" +#include "simdjson/dom/base.h" // for MINIMAL_DOCUMENT_CAPACITY +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/document_stream.h" +#include "simdjson/generic/ondemand/parser.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline parser::parser(size_t max_capacity) noexcept + : _max_capacity{max_capacity} { +} + +simdjson_warn_unused simdjson_inline error_code parser::allocate(size_t new_capacity, size_t new_max_depth) noexcept { + if (new_capacity > max_capacity()) { return CAPACITY; } + if (string_buf && new_capacity == capacity() && new_max_depth == max_depth()) { return SUCCESS; } + + // string_capacity copied from document::allocate + _capacity = 0; + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + SIMDJSON_PADDING, 64); + string_buf.reset(new (std::nothrow) uint8_t[string_capacity]); +#if SIMDJSON_DEVELOPMENT_CHECKS + start_positions.reset(new (std::nothrow) 
token_position[new_max_depth]); +#endif + if (implementation) { + SIMDJSON_TRY( implementation->set_capacity(new_capacity) ); + SIMDJSON_TRY( implementation->set_max_depth(new_max_depth) ); + } else { + SIMDJSON_TRY( simdjson::get_active_implementation()->create_dom_parser_implementation(new_capacity, new_max_depth, implementation) ); + } + _capacity = new_capacity; + _max_depth = new_max_depth; + return SUCCESS; +} +#if SIMDJSON_DEVELOPMENT_CHECKS +simdjson_inline simdjson_warn_unused bool parser::string_buffer_overflow(const uint8_t *string_buf_loc) const noexcept { + return (string_buf_loc < string_buf.get()) || (size_t(string_buf_loc - string_buf.get()) >= capacity()); +} +#endif + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(padded_string_view json) & noexcept { + if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; } + + json.remove_utf8_bom(); + + // Allocate if needed + if (capacity() < json.length() || !string_buf) { + SIMDJSON_TRY( allocate(json.length(), max_depth()) ); + } + + // Run stage 1. + SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); + return document::start({ reinterpret_cast(json.data()), this }); +} + +#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate_allow_incomplete_json(padded_string_view json) & noexcept { + if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; } + + json.remove_utf8_bom(); + + // Allocate if needed + if (capacity() < json.length() || !string_buf) { + SIMDJSON_TRY( allocate(json.length(), max_depth()) ); + } + + // Run stage 1. 
+ const simdjson::error_code err = implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular); + if (err) { + if (err != UNCLOSED_STRING) + return err; + } + return document::start({ reinterpret_cast(json.data()), this, true }); +} +#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const char *json, size_t len, size_t allocated) & noexcept { + return iterate(padded_string_view(json, len, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const uint8_t *json, size_t len, size_t allocated) & noexcept { + return iterate(padded_string_view(json, len, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(std::string_view json, size_t allocated) & noexcept { + return iterate(padded_string_view(json, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(std::string &json) & noexcept { + if(json.capacity() - json.size() < SIMDJSON_PADDING) { + json.reserve(json.size() + SIMDJSON_PADDING); + } + return iterate(padded_string_view(json)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const std::string &json) & noexcept { + return iterate(padded_string_view(json)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const simdjson_result &result) & noexcept { + // We don't presently have a way to temporarily get a const T& from a simdjson_result without throwing an exception + SIMDJSON_TRY( result.error() ); + padded_string_view json = result.value_unsafe(); + return iterate(json); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const simdjson_result &result) & noexcept { + // We don't presently have a way to temporarily get a const T& from a simdjson_result without throwing an exception + SIMDJSON_TRY( result.error() ); + const padded_string &json = result.value_unsafe(); + return 
iterate(json); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate_raw(padded_string_view json) & noexcept { + if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; } + + json.remove_utf8_bom(); + + // Allocate if needed + if (capacity() < json.length()) { + SIMDJSON_TRY( allocate(json.length(), max_depth()) ); + } + + // Run stage 1. + SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); + return json_iterator(reinterpret_cast(json.data()), this); +} + +inline simdjson_result parser::iterate_many(const uint8_t *buf, size_t len, size_t batch_size, bool allow_comma_separated) noexcept { + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + if((len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0)) { + buf += 3; + len -= 3; + } + if(allow_comma_separated && batch_size < len) { batch_size = len; } + return document_stream(*this, buf, len, batch_size, allow_comma_separated); +} +inline simdjson_result parser::iterate_many(const char *buf, size_t len, size_t batch_size, bool allow_comma_separated) noexcept { + return iterate_many(reinterpret_cast(buf), len, batch_size, allow_comma_separated); +} +inline simdjson_result parser::iterate_many(const std::string &s, size_t batch_size, bool allow_comma_separated) noexcept { + return iterate_many(s.data(), s.length(), batch_size, allow_comma_separated); +} +inline simdjson_result parser::iterate_many(const padded_string &s, size_t batch_size, bool allow_comma_separated) noexcept { + return iterate_many(s.data(), s.length(), batch_size, allow_comma_separated); +} + +simdjson_pure simdjson_inline size_t parser::capacity() const noexcept { + return _capacity; +} +simdjson_pure simdjson_inline size_t parser::max_capacity() const noexcept { + return _max_capacity; +} +simdjson_pure simdjson_inline size_t parser::max_depth() const noexcept { + return _max_depth; +} + +simdjson_inline void 
parser::set_max_capacity(size_t max_capacity) noexcept { + if(max_capacity < dom::MINIMAL_DOCUMENT_CAPACITY) { + _max_capacity = max_capacity; + } else { + _max_capacity = dom::MINIMAL_DOCUMENT_CAPACITY; + } +} + +simdjson_inline simdjson_warn_unused simdjson_result parser::unescape(raw_json_string in, uint8_t *&dst, bool allow_replacement) const noexcept { + uint8_t *end = implementation->parse_string(in.buf, dst, allow_replacement); + if (!end) { return STRING_ERROR; } + std::string_view result(reinterpret_cast(dst), end-dst); + dst = end; + return result; +} + +simdjson_inline simdjson_warn_unused simdjson_result parser::unescape_wobbly(raw_json_string in, uint8_t *&dst) const noexcept { + uint8_t *end = implementation->parse_wobbly_string(in.buf, dst); + if (!end) { return STRING_ERROR; } + std::string_view result(reinterpret_cast(dst), end-dst); + dst = end; + return result; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_PARSER_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser.h new file mode 100644 index 000000000000..f1f86d4843bf --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/parser.h @@ -0,0 +1,372 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_PARSER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_PARSER_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#endif // 
SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * The default batch size for document_stream instances for this On-Demand kernel. + * Note that different On-Demand kernel may use a different DEFAULT_BATCH_SIZE value + * in the future. + */ +static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; +/** + * Some adversary might try to set the batch size to 0 or 1, which might cause problems. + * We set a minimum of 32B since anything else is highly likely to be an error. In practice, + * most users will want a much larger batch size. + * + * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON + * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. + */ +static constexpr size_t MINIMAL_BATCH_SIZE = 32; + +/** + * A JSON fragment iterator. + * + * This holds the actual iterator as well as the buffer for writing strings. + */ +class parser { +public: + /** + * Create a JSON parser. + * + * The new parser will have zero capacity. + */ + inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; + + inline parser(parser &&other) noexcept = default; + simdjson_inline parser(const parser &other) = delete; + simdjson_inline parser &operator=(const parser &other) = delete; + simdjson_inline parser &operator=(parser &&other) noexcept = default; + + /** Deallocate the JSON parser. */ + inline ~parser() noexcept = default; + + /** + * Start iterating an on-demand JSON document. + * + * ondemand::parser parser; + * document doc = parser.iterate(json); + * + * It is expected that the content is a valid UTF-8 file, containing a valid JSON document. + * Otherwise the iterate method may return an error. In particular, the whole input should be + * valid: we do not attempt to tolerate incorrect content either before or after a JSON + * document. If there is a UTF-8 BOM, the parser skips it. 
+ * + * ### IMPORTANT: Validate what you use + * + * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to + * iterate does not parse and validate the whole document. + * + * ### IMPORTANT: Buffer Lifetime + * + * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as + * long as the document iteration. + * + * ### IMPORTANT: Document Lifetime + * + * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during + * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before + * you call parse() again or destroy the parser. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you + * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the + * SIMDJSON_PADDING bytes to avoid runtime warnings. + * + * @param json The JSON to parse. + * @param len The length of the JSON. + * @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING). + * + * @return The document, or an error: + * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. + * - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory + * allocation fails. + * - EMPTY if the document is all whitespace. + * - UTF8_ERROR if the document is not valid UTF-8. + * - UNESCAPED_CHARS if a string contains control characters that must be escaped + * - UNCLOSED_STRING if there is an unclosed string in the document. 
+ */ + simdjson_warn_unused simdjson_result iterate(padded_string_view json) & noexcept; +#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + simdjson_warn_unused simdjson_result iterate_allow_incomplete_json(padded_string_view json) & noexcept; +#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(const char *json, size_t len, size_t capacity) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(const uint8_t *json, size_t len, size_t capacity) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(std::string_view json, size_t capacity) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(const std::string &json) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(std::string &json) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(const simdjson_result &json) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(const simdjson_result &json) & noexcept; + /** @overload simdjson_result iterate(padded_string_view json) & noexcept */ + simdjson_warn_unused simdjson_result iterate(padded_string &&json) & noexcept = delete; + + /** + * @private + * + * Start iterating an on-demand JSON document. 
+ * + * ondemand::parser parser; + * json_iterator doc = parser.iterate(json); + * + * ### IMPORTANT: Buffer Lifetime + * + * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as + * long as the document iteration. + * + * ### IMPORTANT: Document Lifetime + * + * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during + * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before + * you call parse() again or destroy the parser. + * + * The ondemand::document instance holds the iterator. The document must remain in scope + * while you are accessing instances of ondemand::value, ondemand::object, ondemand::array. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you + * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the + * SIMDJSON_PADDING bytes to avoid runtime warnings. + * + * @param json The JSON to parse. + * + * @return The iterator, or an error: + * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. + * - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory + * allocation fails. + * - EMPTY if the document is all whitespace. + * - UTF8_ERROR if the document is not valid UTF-8. + * - UNESCAPED_CHARS if a string contains control characters that must be escaped + * - UNCLOSED_STRING if there is an unclosed string in the document. + */ + simdjson_warn_unused simdjson_result iterate_raw(padded_string_view json) & noexcept; + + + /** + * Parse a buffer containing many JSON documents. 
+ * + * auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded; + * ondemand::parser parser; + * ondemand::document_stream docs = parser.iterate_many(json); + * for (auto & doc : docs) { + * std::cout << doc["foo"] << std::endl; + * } + * // Prints 1 2 3 + * + * No copy of the input buffer is made. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. + * + * The caller is responsible to ensure that the input string data remains unchanged and is + * not deleted during the loop. + * + * ### Format + * + * The buffer must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. Documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with ASCII whitespace. + * + * The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8). + * If there is a UTF-8 BOM, the parser skips it. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excessively small values may impact negatively the + * performance. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you + * using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the + * SIMDJSON_PADDING bytes to avoid runtime warnings. 
+ * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The concatenated JSON to parse. + * @param len The length of the concatenated JSON. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 10MB, which has been a reasonable sweet spot in our tests. + * @param allow_comma_separated (defaults on false) This allows a mode where the documents are + * separated by commas instead of whitespace. It comes with a performance + * penalty because the entire document is indexed at once (and the document must be + * less than 4 GB), and there is no multithreading. In this mode, the batch_size parameter + * is effectively ignored, as it is set to at least the document size. + * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). 
+ */ + inline simdjson_result iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE, bool allow_comma_separated = false) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE, bool allow_comma_separated = false) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE, bool allow_comma_separated = false) noexcept; + inline simdjson_result iterate_many(const std::string &&s, size_t batch_size, bool allow_comma_separated = false) = delete;// unsafe + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE, bool allow_comma_separated = false) noexcept; + inline simdjson_result iterate_many(const padded_string &&s, size_t batch_size, bool allow_comma_separated = false) = delete;// unsafe + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_result iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; + + /** The capacity of this parser (the largest document it can process). */ + simdjson_pure simdjson_inline size_t capacity() const noexcept; + /** The maximum capacity of this parser (the largest document it is allowed to process). */ + simdjson_pure simdjson_inline size_t max_capacity() const noexcept; + simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; + /** + * The maximum depth of this parser (the most deeply nested objects and arrays it can process). + * This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. 
+ * The document's instance current_depth() method should be used to monitor the parsing + * depth and limit it if desired. + */ + simdjson_pure simdjson_inline size_t max_depth() const noexcept; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. + * The document's instance current_depth() method should be used to monitor the parsing + * depth and limit it if desired. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept; + + #ifdef SIMDJSON_THREADS_ENABLED + /** + * The parser instance can use threads when they are available to speed up some + * operations. It is enabled by default. Changing this attribute will change the + * behavior of the parser for future operations. + */ + bool threaded{true}; + #endif + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. + * The result must be valid UTF-8. + * The provided pointer is advanced to the end of the string by reference, and a string_view instance + * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least + * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. + * + * This unescape function is a low-level function. If you want a more user-friendly approach, you should + * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() + * instead of get_raw_json_string()). + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid as long as the bytes in dst. 
+ * + * @param raw_json_string input + * @param dst A pointer to a buffer at least large enough to write this string as well as + * an additional SIMDJSON_PADDING bytes. + * @param allow_replacement Whether we allow a replacement if the input string contains unmatched surrogate pairs. + * @return A string_view pointing at the unescaped string in dst + * @error STRING_ERROR if escapes are incorrect. + */ + simdjson_inline simdjson_result unescape(raw_json_string in, uint8_t *&dst, bool allow_replacement = false) const noexcept; + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. + * The result may not be valid UTF-8. See https://simonsapin.github.io/wtf-8/ + * The provided pointer is advanced to the end of the string by reference, and a string_view instance + * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least + * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. + * + * This unescape function is a low-level function. If you want a more user-friendly approach, you should + * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() + * instead of get_raw_json_string()). + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid as long as the bytes in dst. + * + * @param raw_json_string input + * @param dst A pointer to a buffer at least large enough to write this string as well as + * an additional SIMDJSON_PADDING bytes. + * @return A string_view pointing at the unescaped string in dst + * @error STRING_ERROR if escapes are incorrect. + */ + simdjson_inline simdjson_result unescape_wobbly(raw_json_string in, uint8_t *&dst) const noexcept; + +#if SIMDJSON_DEVELOPMENT_CHECKS + /** + * Returns true if string_buf_loc is outside of the allocated range for the + * the string buffer. When true, it indicates that the string buffer has overflowed. 
+ * This is a development-time check that is not needed in production. It can be + * used to detect buffer overflows in the string buffer and usafe usage of the + * string buffer. + */ + bool string_buffer_overflow(const uint8_t *string_buf_loc) const noexcept; +#endif + +private: + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; + size_t _capacity{0}; + size_t _max_capacity; + size_t _max_depth{DEFAULT_MAX_DEPTH}; + std::unique_ptr string_buf{}; +#if SIMDJSON_DEVELOPMENT_CHECKS + std::unique_ptr start_positions{}; +#endif + + friend class json_iterator; + friend class document_stream; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_PARSER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string-inl.h new file mode 100644 index 000000000000..5b814dd801ef --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string-inl.h @@ -0,0 +1,203 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/json_iterator-inl.h" +#include "simdjson/generic/implementation_simdjson_result_base-inl.h" +#endif // 
SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { + +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline raw_json_string::raw_json_string(const uint8_t * _buf) noexcept : buf{_buf} {} + +simdjson_inline const char * raw_json_string::raw() const noexcept { return reinterpret_cast(buf); } + + +simdjson_inline bool raw_json_string::is_free_from_unescaped_quote(std::string_view target) noexcept { + size_t pos{0}; + // if the content has no escape character, just scan through it quickly! + for(;pos < target.size() && target[pos] != '\\';pos++) {} + // slow path may begin. + bool escaping{false}; + for(;pos < target.size();pos++) { + if((target[pos] == '"') && !escaping) { + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + return true; +} + +simdjson_inline bool raw_json_string::is_free_from_unescaped_quote(const char* target) noexcept { + size_t pos{0}; + // if the content has no escape character, just scan through it quickly! + for(;target[pos] && target[pos] != '\\';pos++) {} + // slow path may begin. + bool escaping{false}; + for(;target[pos];pos++) { + if((target[pos] == '"') && !escaping) { + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + return true; +} + + +simdjson_inline bool raw_json_string::unsafe_is_equal(size_t length, std::string_view target) const noexcept { + // If we are going to call memcmp, then we must know something about the length of the raw_json_string. + return (length >= target.size()) && (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size()); +} + +simdjson_inline bool raw_json_string::unsafe_is_equal(std::string_view target) const noexcept { + // Assumptions: does not contain unescaped quote characters, and + // the raw content is quote terminated within a valid JSON string. 
+ if(target.size() <= SIMDJSON_PADDING) { + return (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size()); + } + const char * r{raw()}; + size_t pos{0}; + for(;pos < target.size();pos++) { + if(r[pos] != target[pos]) { return false; } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_inline bool raw_json_string::is_equal(std::string_view target) const noexcept { + const char * r{raw()}; + size_t pos{0}; + bool escaping{false}; + for(;pos < target.size();pos++) { + if(r[pos] != target[pos]) { return false; } + // if target is a compile-time constant and it is free from + // quotes, then the next part could get optimized away through + // inlining. + if((target[pos] == '"') && !escaping) { + // We have reached the end of the raw_json_string but + // the target is not done. + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + if(r[pos] != '"') { return false; } + return true; +} + + +simdjson_inline bool raw_json_string::unsafe_is_equal(const char * target) const noexcept { + // Assumptions: 'target' does not contain unescaped quote characters, is null terminated and + // the raw content is quote terminated within a valid JSON string. + const char * r{raw()}; + size_t pos{0}; + for(;target[pos];pos++) { + if(r[pos] != target[pos]) { return false; } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_inline bool raw_json_string::is_equal(const char* target) const noexcept { + // Assumptions: does not contain unescaped quote characters, and + // the raw content is quote terminated within a valid JSON string. + const char * r{raw()}; + size_t pos{0}; + bool escaping{false}; + for(;target[pos];pos++) { + if(r[pos] != target[pos]) { return false; } + // if target is a compile-time constant and it is free from + // quotes, then the next part could get optimized away through + // inlining. 
+ if((target[pos] == '"') && !escaping) { + // We have reached the end of the raw_json_string but + // the target is not done. + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept { + return a.unsafe_is_equal(c); +} + +simdjson_unused simdjson_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept { + return a == c; +} + +simdjson_unused simdjson_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept { + return !(a == c); +} + +simdjson_unused simdjson_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept { + return !(a == c); +} + + +simdjson_inline simdjson_warn_unused simdjson_result raw_json_string::unescape(json_iterator &iter, bool allow_replacement) const noexcept { + return iter.unescape(*this, allow_replacement); +} + +simdjson_inline simdjson_warn_unused simdjson_result raw_json_string::unescape_wobbly(json_iterator &iter) const noexcept { + return iter.unescape_wobbly(*this); +} + +simdjson_unused simdjson_inline std::ostream &operator<<(std::ostream &out, const raw_json_string &str) noexcept { + bool in_escape = false; + const char *s = str.raw(); + while (true) { + switch (*s) { + case '\\': in_escape = !in_escape; break; + case '"': if (in_escape) { in_escape = false; } else { return out; } break; + default: if (in_escape) { in_escape = false; } + } + out << *s; + s++; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : 
implementation_simdjson_result_base(error) {} + +simdjson_inline simdjson_result simdjson_result::raw() const noexcept { + if (error()) { return error(); } + return first.raw(); +} +simdjson_inline simdjson_warn_unused simdjson_result simdjson_result::unescape(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &iter, bool allow_replacement) const noexcept { + if (error()) { return error(); } + return first.unescape(iter, allow_replacement); +} +simdjson_inline simdjson_warn_unused simdjson_result simdjson_result::unescape_wobbly(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &iter) const noexcept { + if (error()) { return error(); } + return first.unescape_wobbly(iter); +} +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string.h new file mode 100644 index 000000000000..be50a402d378 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/raw_json_string.h @@ -0,0 +1,206 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * A string escaped per JSON rules, terminated with quote ("). They are used to represent + * unescaped keys inside JSON documents. + * + * (In other words, a pointer to the beginning of a string, just after the start quote, inside a + * JSON file.) + * + * This class is deliberately simplistic and has little functionality. You can + * compare a raw_json_string instance with an unescaped C string, but + * that is nearly all you can do. + * + * The raw_json_string is unescaped. 
If you wish to write an unescaped version of it to your own + * buffer, you may do so using the parser.unescape(string, buff) method, using an ondemand::parser + * instance. Doing so requires you to have a sufficiently large buffer. + * + * The raw_json_string instances originate typically from field instance which in turn represent + * key-value pairs from object instances. From a field instance, you get the raw_json_string + * instance by calling key(). You can, if you want a more usable string_view instance, call + * the unescaped_key() method on the field instance. You may also create a raw_json_string from + * any other string value, with the value.get_raw_json_string() method. Again, you can get + * a more usable string_view instance by calling get_string(). + * + */ +class raw_json_string { +public: + /** + * Create a new invalid raw_json_string. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline raw_json_string() noexcept = default; + + /** + * Create a new invalid raw_json_string pointed at the given location in the JSON. + * + * The given location must be just *after* the beginning quote (") in the JSON file. + * + * It *must* be terminated by a ", and be a valid JSON string. + */ + simdjson_inline raw_json_string(const uint8_t * _buf) noexcept; + /** + * Get the raw pointer to the beginning of the string in the JSON (just after the "). + * + * It is possible for this function to return a null pointer if the instance + * has outlived its existence. + */ + simdjson_inline const char * raw() const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done) on target.size() characters, + * and if the raw_json_string instance has a quote character at byte index target.size(). + * We never read more than length + 1 bytes in the raw_json_string instance. 
+ * If length is smaller than target.size(), this will return false. + * + * The std::string_view instance may contain any characters. However, the caller + * is responsible for setting length so that length bytes may be read in the + * raw_json_string. + * + * Performance: the comparison may be done using memcmp which may be efficient + * for long strings. + */ + simdjson_inline bool unsafe_is_equal(size_t length, std::string_view target) const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done). + * The std::string_view instance should not contain unescaped quote characters: + * the caller is responsible for this check. See is_free_from_unescaped_quote. + * + * Performance: the comparison is done byte-by-byte which might be inefficient for + * long strings. + * + * If target is a compile-time constant, and your compiler likes you, + * you should be able to do the following without performance penalty... + * + * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); + * s.unsafe_is_equal(target); + */ + simdjson_inline bool unsafe_is_equal(std::string_view target) const noexcept; + + /** + * This compares the current instance to the C string target: returns true if + * they are byte-by-byte equal (no escaping is done). + * The provided C string should not contain an unescaped quote character: + * the caller is responsible for this check. See is_free_from_unescaped_quote. + * + * If target is a compile-time constant, and your compiler likes you, + * you should be able to do the following without performance penalty... 
+ * + * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); + * s.unsafe_is_equal(target); + */ + simdjson_inline bool unsafe_is_equal(const char* target) const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done). + */ + simdjson_inline bool is_equal(std::string_view target) const noexcept; + + /** + * This compares the current instance to the C string target: returns true if + * they are byte-by-byte equal (no escaping is done). + */ + simdjson_inline bool is_equal(const char* target) const noexcept; + + /** + * Returns true if target is free from unescaped quote. If target is known at + * compile-time, we might expect the computation to happen at compile time with + * many compilers (not all!). + */ + static simdjson_inline bool is_free_from_unescaped_quote(std::string_view target) noexcept; + static simdjson_inline bool is_free_from_unescaped_quote(const char* target) noexcept; + +private: + + + /** + * This will set the inner pointer to zero, effectively making + * this instance unusable. + */ + simdjson_inline void consume() noexcept { buf = nullptr; } + + /** + * Checks whether the inner pointer is non-null and thus usable. + */ + simdjson_inline simdjson_warn_unused bool alive() const noexcept { return buf != nullptr; } + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. + * The result will be a valid UTF-8. + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid until the next parse() call on the parser. + * + * @param iter A json_iterator, which contains a buffer where the string will be written. + * @param allow_replacement Whether we allow replacement of invalid surrogate pairs. 
+ */ + simdjson_inline simdjson_warn_unused simdjson_result unescape(json_iterator &iter, bool allow_replacement) const noexcept; + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. + * The result may not be a valid UTF-8. https://simonsapin.github.io/wtf-8/ + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid until the next parse() call on the parser. + * + * @param iter A json_iterator, which contains a buffer where the string will be written. + */ + simdjson_inline simdjson_warn_unused simdjson_result unescape_wobbly(json_iterator &iter) const noexcept; + const uint8_t * buf{}; + friend class object; + friend class field; + friend class parser; + friend struct simdjson_result; +}; + +simdjson_unused simdjson_inline std::ostream &operator<<(std::ostream &, const raw_json_string &) noexcept; + +/** + * Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user is responsible + * for providing a string with no unescaped quote. Note that unescaped quotes cannot be present in valid JSON strings. 
+ */ +simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept; +simdjson_unused simdjson_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept; +simdjson_unused simdjson_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept; +simdjson_unused simdjson_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept; + + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private + + simdjson_inline simdjson_result raw() const noexcept; + simdjson_inline simdjson_warn_unused simdjson_result unescape(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &iter, bool allow_replacement) const noexcept; + simdjson_inline simdjson_warn_unused simdjson_result unescape_wobbly(SIMDJSON_IMPLEMENTATION::ondemand::json_iterator &iter) const noexcept; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_RAW_JSON_STRING_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization-inl.h new file mode 100644 index 000000000000..77be39a11ced --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization-inl.h @@ -0,0 +1,233 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_INL_H +#include "simdjson/generic/ondemand/base.h" +#include 
"simdjson/generic/ondemand/array.h" +#include "simdjson/generic/ondemand/document-inl.h" +#include "simdjson/generic/ondemand/json_type.h" +#include "simdjson/generic/ondemand/object.h" +#include "simdjson/generic/ondemand/serialization.h" +#include "simdjson/generic/ondemand/value.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { + +inline std::string_view trim(const std::string_view str) noexcept { + // We can almost surely do better by rolling our own find_first_not_of function. + size_t first = str.find_first_not_of(" \t\n\r"); + // If we have the empty string (just white space), then no trimming is possible, and + // we return the empty string_view. + if (std::string_view::npos == first) { return std::string_view(); } + size_t last = str.find_last_not_of(" \t\n\r"); + return str.substr(first, (last - first + 1)); +} + + +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::document& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::document_reference& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::value& x) noexcept { + /** + * If we somehow receive a value that has already been consumed, + * then the following code could be in trouble. E.g., we create + * an array as needed, but if an array was already created, then + * it could be bad. 
+ */ + using namespace SIMDJSON_IMPLEMENTATION::ondemand; + SIMDJSON_IMPLEMENTATION::ondemand::json_type t; + auto error = x.type().get(t); + if(error != SUCCESS) { return error; } + switch (t) + { + case json_type::array: + { + SIMDJSON_IMPLEMENTATION::ondemand::array array; + error = x.get_array().get(array); + if(error) { return error; } + return to_json_string(array); + } + case json_type::object: + { + SIMDJSON_IMPLEMENTATION::ondemand::object object; + error = x.get_object().get(object); + if(error) { return error; } + return to_json_string(object); + } + default: + return trim(x.raw_json_token()); + } +} + +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::object& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::array& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} +} // namespace simdjson + +namespace simdjson { namespace SIMDJSON_IMPLEMENTATION { namespace ondemand { + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, 
simdjson::SIMDJSON_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::document_reference& value) { + 
std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif +}}} // namespace simdjson::SIMDJSON_IMPLEMENTATION::ondemand + +#endif // SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization.h new file mode 
100644 index 000000000000..048c73cda81b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/serialization.h @@ -0,0 +1,103 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_H +#include "simdjson/generic/ondemand/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Create a string-view instance out of a document instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::document& x) noexcept; +/** + * Create a string-view instance out of a value instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. The value must + * not have been accessed previously. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::value& x) noexcept; +/** + * Create a string-view instance out of an object instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::object& x) noexcept; +/** + * Create a string-view instance out of an array instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_IMPLEMENTATION::ondemand::array& x) noexcept; +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +} // namespace simdjson + +/** + * We want to support argument-dependent lookup (ADL). 
+ * Hence we should define operator<< in the namespace + * where the argument (here value, object, etc.) resides. + * Credit: @madhur4127 + * See https://github.com/simdjson/simdjson/issues/1768 + */ +namespace simdjson { namespace SIMDJSON_IMPLEMENTATION { namespace ondemand { + +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The element. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::value x); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::array value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::document& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::document_reference& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +/** + * Print JSON to an output stream. 
It does not + * validate the content. + * + * @param out The output stream. + * @param value The object. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_IMPLEMENTATION::ondemand::object value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +}}} // namespace simdjson::SIMDJSON_IMPLEMENTATION::ondemand + +#endif // SIMDJSON_GENERIC_ONDEMAND_SERIALIZATION_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator-inl.h new file mode 100644 index 000000000000..c93a10d82901 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator-inl.h @@ -0,0 +1,94 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/token_iterator.h" +#include "simdjson/generic/implementation_simdjson_result_base-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline token_iterator::token_iterator( + const uint8_t *_buf, + token_position position +) noexcept : buf{_buf}, _position{position} +{ +} + +simdjson_inline uint32_t token_iterator::current_offset() const noexcept { + return *(_position); +} + + +simdjson_inline const uint8_t *token_iterator::return_current_and_advance() noexcept { + return &buf[*(_position++)]; +} + +simdjson_inline const uint8_t *token_iterator::peek(token_position position) const noexcept { + return &buf[*position]; +} +simdjson_inline uint32_t token_iterator::peek_index(token_position position) const noexcept { + return *position; +} 
+simdjson_inline uint32_t token_iterator::peek_length(token_position position) const noexcept { + return *(position+1) - *position; +} + +simdjson_inline uint32_t token_iterator::peek_root_length(token_position position) const noexcept { + return *(position+2) - *(position) > *(position+1) - *(position) ? + *(position+1) - *(position) + : *(position+2) - *(position); +} +simdjson_inline const uint8_t *token_iterator::peek(int32_t delta) const noexcept { + return &buf[*(_position+delta)]; +} +simdjson_inline uint32_t token_iterator::peek_index(int32_t delta) const noexcept { + return *(_position+delta); +} +simdjson_inline uint32_t token_iterator::peek_length(int32_t delta) const noexcept { + return *(_position+delta+1) - *(_position+delta); +} + +simdjson_inline token_position token_iterator::position() const noexcept { + return _position; +} +simdjson_inline void token_iterator::set_position(token_position target_position) noexcept { + _position = target_position; +} + +simdjson_inline bool token_iterator::operator==(const token_iterator &other) const noexcept { + return _position == other._position; +} +simdjson_inline bool token_iterator::operator!=(const token_iterator &other) const noexcept { + return _position != other._position; +} +simdjson_inline bool token_iterator::operator>(const token_iterator &other) const noexcept { + return _position > other._position; +} +simdjson_inline bool token_iterator::operator>=(const token_iterator &other) const noexcept { + return _position >= other._position; +} +simdjson_inline bool token_iterator::operator<(const token_iterator &other) const noexcept { + return _position < other._position; +} +simdjson_inline bool token_iterator::operator<=(const token_iterator &other) const noexcept { + return _position <= other._position; +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline 
simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::token_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator.h new file mode 100644 index 000000000000..dc1b4fac4cbf --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/token_iterator.h @@ -0,0 +1,158 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/logger.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * Iterates through JSON tokens (`{` `}` `[` `]` `,` `:` `""` `123` `true` `false` `null`) + * detected by stage 1. + * + * @private This is not intended for external use. + */ +class token_iterator { +public: + /** + * Create a new invalid token_iterator. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline token_iterator() noexcept = default; + simdjson_inline token_iterator(token_iterator &&other) noexcept = default; + simdjson_inline token_iterator &operator=(token_iterator &&other) noexcept = default; + simdjson_inline token_iterator(const token_iterator &other) noexcept = default; + simdjson_inline token_iterator &operator=(const token_iterator &other) noexcept = default; + + /** + * Advance to the next token (returning the current one). 
+ */ + simdjson_inline const uint8_t *return_current_and_advance() noexcept; + /** + * Reports the current offset in bytes from the start of the underlying buffer. + */ + simdjson_inline uint32_t current_offset() const noexcept; + /** + * Get the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it is not used... + */ + simdjson_inline const uint8_t *peek(int32_t delta=0) const noexcept; + /** + * Get the maximum length of the JSON text for a given token. + * + * The length will include any whitespace at the end of the token. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_length(int32_t delta=0) const noexcept; + + /** + * Get the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token. + * + */ + simdjson_inline const uint8_t *peek(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for a given token. + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token. + */ + simdjson_inline uint32_t peek_length(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for a root token. + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token (start of the document). + */ + simdjson_inline uint32_t peek_root_length(token_position position) const noexcept; + /** + * Return the current index. 
+ */ + simdjson_inline token_position position() const noexcept; + /** + * Reset to a previously saved index. + */ + simdjson_inline void set_position(token_position target_position) noexcept; + + // NOTE: we don't support a full C++ iterator interface, because we expect people to make + // different calls to advance the iterator based on *their own* state. + + simdjson_inline bool operator==(const token_iterator &other) const noexcept; + simdjson_inline bool operator!=(const token_iterator &other) const noexcept; + simdjson_inline bool operator>(const token_iterator &other) const noexcept; + simdjson_inline bool operator>=(const token_iterator &other) const noexcept; + simdjson_inline bool operator<(const token_iterator &other) const noexcept; + simdjson_inline bool operator<=(const token_iterator &other) const noexcept; + +protected: + simdjson_inline token_iterator(const uint8_t *buf, token_position position) noexcept; + + /** + * Get the index of the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_index(int32_t delta=0) const noexcept; + /** + * Get the index of the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token. + * + */ + simdjson_inline uint32_t peek_index(token_position position) const noexcept; + + const uint8_t *buf{}; + token_position _position{}; + + friend class json_iterator; + friend class value_iterator; + friend class object; + template + friend simdjson_inline void logger::log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta, logger::log_level level, Args&&... 
args) noexcept; + template + friend simdjson_inline void logger::log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail, logger::log_level level, Args&&... args) noexcept; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::token_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_TOKEN_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/value-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value-inl.h new file mode 100644 index 000000000000..886bb9d050ac --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value-inl.h @@ -0,0 +1,542 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_VALUE_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_VALUE_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/ondemand/array.h" +#include "simdjson/generic/ondemand/array_iterator.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/json_type.h" +#include "simdjson/generic/ondemand/object.h" +#include "simdjson/generic/ondemand/raw_json_string.h" +#include "simdjson/generic/ondemand/value.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline value::value(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} 
+simdjson_inline value value::start(const value_iterator &iter) noexcept { + return iter; +} +simdjson_inline value value::resume(const value_iterator &iter) noexcept { + return iter; +} + +simdjson_inline simdjson_result value::get_array() noexcept { + return array::start(iter); +} +simdjson_inline simdjson_result value::get_object() noexcept { + return object::start(iter); +} +simdjson_inline simdjson_result value::start_or_resume_object() noexcept { + if (iter.at_start()) { + return get_object(); + } else { + return object::resume(iter); + } +} + +simdjson_inline simdjson_result value::get_raw_json_string() noexcept { + return iter.get_raw_json_string(); +} +simdjson_inline simdjson_result value::get_string(bool allow_replacement) noexcept { + return iter.get_string(allow_replacement); +} +template +simdjson_inline error_code value::get_string(string_type& receiver, bool allow_replacement) noexcept { + return iter.get_string(receiver, allow_replacement); +} +simdjson_inline simdjson_result value::get_wobbly_string() noexcept { + return iter.get_wobbly_string(); +} +simdjson_inline simdjson_result value::get_double() noexcept { + return iter.get_double(); +} +simdjson_inline simdjson_result value::get_double_in_string() noexcept { + return iter.get_double_in_string(); +} +simdjson_inline simdjson_result value::get_uint64() noexcept { + return iter.get_uint64(); +} +simdjson_inline simdjson_result value::get_uint64_in_string() noexcept { + return iter.get_uint64_in_string(); +} +simdjson_inline simdjson_result value::get_int64() noexcept { + return iter.get_int64(); +} +simdjson_inline simdjson_result value::get_int64_in_string() noexcept { + return iter.get_int64_in_string(); +} +simdjson_inline simdjson_result value::get_bool() noexcept { + return iter.get_bool(); +} +simdjson_inline simdjson_result value::is_null() noexcept { + return iter.is_null(); +} +template<> simdjson_inline simdjson_result value::get() noexcept { return get_array(); } +template<> 
simdjson_inline simdjson_result value::get() noexcept { return get_object(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_string(false); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_number(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_double(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_uint64(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_int64(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_bool(); } + +template simdjson_inline error_code value::get(T &out) noexcept { + return get().get(out); +} + +#if SIMDJSON_EXCEPTIONS +template +simdjson_inline value::operator T() noexcept(false) { + return get(); +} +simdjson_inline value::operator array() noexcept(false) { + return get_array(); +} +simdjson_inline value::operator object() noexcept(false) { + return get_object(); +} +simdjson_inline value::operator uint64_t() noexcept(false) { + return get_uint64(); +} +simdjson_inline value::operator int64_t() noexcept(false) { + return get_int64(); +} +simdjson_inline value::operator double() noexcept(false) { + return get_double(); +} +simdjson_inline value::operator std::string_view() noexcept(false) { + return get_string(false); +} +simdjson_inline value::operator raw_json_string() noexcept(false) { + return get_raw_json_string(); +} +simdjson_inline value::operator bool() noexcept(false) { + return get_bool(); +} +#endif + +simdjson_inline simdjson_result value::begin() & noexcept { + return get_array().begin(); +} +simdjson_inline simdjson_result value::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result value::count_elements() & noexcept { + simdjson_result answer; + auto a = get_array(); + answer = a.count_elements(); + // 
count_elements leaves you pointing inside the array, at the first element. + // We need to move back so that the user can create a new array (which requires that + // we point at '['). + iter.move_at_start(); + return answer; +} +simdjson_inline simdjson_result value::count_fields() & noexcept { + simdjson_result answer; + auto a = get_object(); + answer = a.count_fields(); + iter.move_at_start(); + return answer; +} +simdjson_inline simdjson_result value::at(size_t index) noexcept { + auto a = get_array(); + return a.at(index); +} + +simdjson_inline simdjson_result value::find_field(std::string_view key) noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result value::find_field(const char *key) noexcept { + return start_or_resume_object().find_field(key); +} + +simdjson_inline simdjson_result value::find_field_unordered(std::string_view key) noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result value::find_field_unordered(const char *key) noexcept { + return start_or_resume_object().find_field_unordered(key); +} + +simdjson_inline simdjson_result value::operator[](std::string_view key) noexcept { + return start_or_resume_object()[key]; +} +simdjson_inline simdjson_result value::operator[](const char *key) noexcept { + return start_or_resume_object()[key]; +} + +simdjson_inline simdjson_result value::type() noexcept { + return iter.type(); +} + +simdjson_inline simdjson_result value::is_scalar() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return ! 
((this_type == json_type::array) || (this_type == json_type::object)); +} + +simdjson_inline simdjson_result value::is_string() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return (this_type == json_type::string); +} + + +simdjson_inline bool value::is_negative() noexcept { + return iter.is_negative(); +} + +simdjson_inline simdjson_result value::is_integer() noexcept { + return iter.is_integer(); +} +simdjson_warn_unused simdjson_inline simdjson_result value::get_number_type() noexcept { + return iter.get_number_type(); +} +simdjson_warn_unused simdjson_inline simdjson_result value::get_number() noexcept { + return iter.get_number(); +} + +simdjson_inline std::string_view value::raw_json_token() noexcept { + return std::string_view(reinterpret_cast(iter.peek_start()), iter.peek_start_length()); +} + +simdjson_inline simdjson_result value::raw_json() noexcept { + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: { + ondemand::array array; + SIMDJSON_TRY(get_array().get(array)); + return array.raw_json(); + } + case json_type::object: { + ondemand::object object; + SIMDJSON_TRY(get_object().get(object)); + return object.raw_json(); + } + default: + return raw_json_token(); + } +} + +simdjson_inline simdjson_result value::current_location() noexcept { + return iter.json_iter().current_location(); +} + +simdjson_inline int32_t value::current_depth() const noexcept{ + return iter.json_iter().depth(); +} + +inline bool is_pointer_well_formed(std::string_view json_pointer) noexcept { + if (simdjson_unlikely(json_pointer.empty())) { // can't be + return false; + } + if (simdjson_unlikely(json_pointer[0] != '/')) { + return false; + } + size_t escape = json_pointer.find('~'); + if (escape == std::string_view::npos) { + return true; + } + if (escape == json_pointer.size() - 1) { + return false; + } + if (json_pointer[escape + 1] != '0' && json_pointer[escape + 1] != '1') { + 
return false; + } + return true; +} + +simdjson_inline simdjson_result value::at_pointer(std::string_view json_pointer) noexcept { + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + // a non-empty string can be invalid, or accessing a primitive (issue 2154) + if (is_pointer_well_formed(json_pointer)) { + return NO_SUCH_FIELD; + } + return INVALID_JSON_POINTER; + } +} + +simdjson_inline simdjson_result value::at_path(std::string_view json_path) noexcept { + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) { + case json_type::array: + return (*this).get_array().at_path(json_path); + case json_type::object: + return (*this).get_object().at_path(json_path); + default: + return INVALID_JSON_POINTER; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_IMPLEMENTATION::ondemand::value &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + if 
(error()) { return error(); } + return {}; +} + +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) noexcept { + if (error()) { return error(); } + return first.find_field(key); +} + +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} + +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) noexcept { + if (error()) { return error(); } + return first[key]; +} + +simdjson_inline simdjson_result simdjson_result::get_array() noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64_in_string() noexcept { + if (error()) { return error(); } + return first.get_uint64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64_in_string() noexcept { + if (error()) { return error(); } + return first.get_int64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if 
(error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_double_in_string() noexcept { + if (error()) { return error(); } + return first.get_double_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_string(bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.get_string(allow_replacement); +} +template +simdjson_inline error_code simdjson_result::get_string(string_type& receiver, bool allow_replacement) noexcept { + if (error()) { return error(); } + return first.get_string(receiver, allow_replacement); +} +simdjson_inline simdjson_result simdjson_result::get_wobbly_string() noexcept { + if (error()) { return error(); } + return first.get_wobbly_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} + +template simdjson_inline simdjson_result simdjson_result::get() noexcept { + if (error()) { return error(); } + return first.get(); +} +template simdjson_inline error_code simdjson_result::get(T &out) noexcept { + if (error()) { return error(); } + return first.get(out); +} + +template<> simdjson_inline simdjson_result simdjson_result::get() noexcept { + if (error()) { return error(); } + return std::move(first); +} +template<> simdjson_inline error_code simdjson_result::get(SIMDJSON_IMPLEMENTATION::ondemand::value &out) noexcept { + if (error()) { return error(); } + out = first; + return SUCCESS; +} + +simdjson_inline simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} +simdjson_inline simdjson_result 
simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} +simdjson_inline simdjson_result simdjson_result::is_string() noexcept { + if (error()) { return error(); } + return first.is_string(); +} +simdjson_inline simdjson_result simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} +#if SIMDJSON_EXCEPTIONS +template +simdjson_inline simdjson_result::operator T() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return static_cast(first); +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::array() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_IMPLEMENTATION::ondemand::object() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator 
SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json() noexcept { + if (error()) { return error(); } + return first.raw_json(); +} + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline simdjson_result simdjson_result::current_depth() const noexcept { + if (error()) { return error(); } + return first.current_depth(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer( + std::string_view json_pointer) noexcept { + if (error()) { + return error(); + } + return first.at_pointer(json_pointer); +} + +simdjson_inline simdjson_result simdjson_result::at_path( + std::string_view json_path) noexcept { + if (error()) { + return error(); + } + return first.at_path(json_path); +} + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_VALUE_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/value.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value.h new file mode 100644 index 000000000000..bd6b74ed3268 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value.h @@ -0,0 +1,781 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_VALUE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_VALUE_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace 
SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * An ephemeral JSON value returned during iteration. It is only valid for as long as you do + * not access more data in the JSON document. + */ +class value { +public: + /** + * Create a new invalid value. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline value() noexcept = default; + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array(), get_raw_json_string(), or get_string() instead. + * + * @returns A value of the given type, parsed from the JSON. + * @returns INCORRECT_TYPE If the JSON value is not the given type. + */ + template simdjson_inline simdjson_result get() noexcept { + // Unless the simdjson library or the user provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library. " + "The supported types are ondemand::object, ondemand::array, raw_json_string, std::string_view, uint64_t, " + "int64_t, double, and bool. We recommend you use get_double(), get_bool(), get_uint64(), get_int64(), " + " get_object(), get_array(), get_raw_json_string(), or get_string() instead of the get template." + " You may also add support for custom types, see our documentation."); + } + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * @param out This is set to a value of the given type, parsed from the JSON. If there is an error, this may not be initialized. + * @returns INCORRECT_TYPE If the JSON value is not an object. + * @returns SUCCESS If the parse succeeded and the out parameter was set to the value. 
+ */ + template simdjson_inline error_code get(T &out) noexcept; + + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @returns INCORRECT_TYPE If the JSON value is not an array. + */ + simdjson_inline simdjson_result get_array() noexcept; + + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @returns INCORRECT_TYPE If the JSON value is not an object. + */ + simdjson_inline simdjson_result get_object() noexcept; + + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64() noexcept; + + /** + * Cast this JSON value (inside string) to a unsigned integer. + * + * @returns A unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64() noexcept; + + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64_in_string() noexcept; + + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double() noexcept; + + /** + * Cast this JSON value (inside string) to a double + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. 
+ */ + simdjson_inline simdjson_result get_double_in_string() noexcept; + + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Equivalent to get(). + * + * Important: a value should be consumed once. Calling get_string() twice on the same value + * is an error. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + + /** + * Attempts to fill the provided std::string reference with the parsed value of the current string. + * + * The string is guaranteed to be valid UTF-8. + * + * Important: a value should be consumed once. Calling get_string() twice on the same value + * is an error. + * + * Performance: This method may be slower than get_string() or get_string(bool) because it may need to allocate memory. + * We recommend you avoid allocating an std::string unless you need to. + * + * @returns INCORRECT_TYPE if the JSON value is not a string. Otherwise, we return SUCCESS. + */ + template + simdjson_inline error_code get_string(string_type& receiver, bool allow_replacement = false) noexcept; + + /** + * Cast this JSON value to a "wobbly" string. + * + * The string is may not be a valid UTF-8 string. + * See https://simonsapin.github.io/wtf-8/ + * + * Important: a value should be consumed once. Calling get_wobbly_string() twice on the same value + * is an error. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_wobbly_string() noexcept; + /** + * Cast this JSON value to a raw_json_string. 
+ * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_raw_json_string() noexcept; + + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @returns INCORRECT_TYPE if the JSON value is not true or false. + */ + simdjson_inline simdjson_result get_bool() noexcept; + + /** + * Checks if this JSON value is null. If and only if the value is + * null, then it is consumed (we advance). If we find a token that + * begins with 'n' but is not 'null', then an error is returned. + * + * @returns Whether the value is null. + * @returns INCORRECT_TYPE If the JSON value begins with 'n' and is not 'null'. + */ + simdjson_inline simdjson_result is_null() noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Cast this JSON value to an instance of type T. The programmer is responsible for + * providing an implementation of get for the type T, if T is not one of the types + * supported by the library (object, array, raw_json_string, string_view, uint64_t, etc.). + * + * See https://github.com/simdjson/simdjson/blob/master/doc/basics.md#adding-support-for-custom-types + * + * @returns An instance of type T + */ + template + explicit simdjson_inline operator T() noexcept(false); + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an array. + */ + simdjson_inline operator array() noexcept(false); + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an object. + */ + simdjson_inline operator object() noexcept(false); + /** + * Cast this JSON value to an unsigned integer. 
+ * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline operator uint64_t() noexcept(false); + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit integer. + */ + simdjson_inline operator int64_t() noexcept(false); + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a valid floating-point number. + */ + simdjson_inline operator double() noexcept(false); + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Equivalent to get(). + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator std::string_view() noexcept(false); + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator raw_json_string() noexcept(false); + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not true or false. + */ + simdjson_inline operator bool() noexcept(false); +#endif + + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + * + * @returns INCORRECT_TYPE If the JSON value is not an array. + */ + simdjson_inline simdjson_result begin() & noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. 
+ */ + simdjson_inline simdjson_result end() & noexcept; + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * Performance hint: You should only call count_elements() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. + * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method on the object instance. + * + * Performance hint: You should only call count_fields() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. 
+ * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) noexcept; + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. 
+ * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field as not there when they are not in order). + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field_unordered(const char *key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](const char *key) noexcept; + + /** + * Get the type of this JSON value. It does not validate or consume the value. + * E.g., you must still call "is_null()" to check that a value is null even if + * "type()" returns json_type::null. + * + * NOTE: If you're only expecting a value to be one type (a typical case), it's generally + * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just + * let it throw an exception). + * + * @return The type of JSON value (json_type::array, json_type::object, json_type::string, + * json_type::number, json_type::boolean, or json_type::null). 
+ * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result type() noexcept; + + /** + * Checks whether the value is a scalar (string, number, null, Boolean). + * Returns false when there it is an array or object. + * + * @returns true if the type is string, number, null, Boolean + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result is_scalar() noexcept; + /** + * Checks whether the value is a string. + * + * @returns true if the type is string + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result is_string() noexcept; + + /** + * Checks whether the value is a negative number. + * + * @returns true if the number if negative. + */ + simdjson_inline bool is_negative() noexcept; + /** + * Checks whether the value is an integer number. Note that + * this requires to partially parse the number string. If + * the value is determined to be an integer, it may still + * not parse properly as an integer in subsequent steps + * (e.g., it might overflow). + * + * Performance note: if you call this function systematically + * before parsing a number, you may have fallen for a performance + * anti-pattern. + * + * @returns true if the number if negative. + */ + simdjson_inline simdjson_result is_integer() noexcept; + /** + * Determine the number type (integer or floating-point number) as quickly + * as possible. This function does not fully validate the input. It is + * useful when you only need to classify the numbers, without parsing them. + * + * If you are planning to retrieve the value or you need full validation, + * consider using the get_number() method instead: it will fully parse + * and validate the input, and give you access to the type: + * get_number().get_number_type(). 
+ * + * get_number_type() is number_type::unsigned_integer if we have + * an integer greater or equal to 9223372036854775808. + * get_number_type() is number_type::signed_integer if we have an + * integer that is less than 9223372036854775808. + * get_number_type() is number_type::big_integer for integers that do not fit in 64 bits, + * in which case the digit_count is set to the length of the big integer string. + * Otherwise, get_number_type() has value number_type::floating_point_number. + * + * This function requires processing the number string, but it is expected + * to be faster than get_number().get_number_type() because it is does not + * parse the number value. + * + * @returns the type of the number + */ + simdjson_inline simdjson_result get_number_type() noexcept; + + /** + * Attempt to parse an ondemand::number. An ondemand::number may + * contain an integer value or a floating-point value, the simdjson + * library will autodetect the type. Thus it is a dynamically typed + * number. Before accessing the value, you must determine the detected + * type. + * + * number.get_number_type() is number_type::signed_integer if we have + * an integer in [-9223372036854775808,9223372036854775808) + * You can recover the value by calling number.get_int64() and you + * have that number.is_int64() is true. + * + * number.get_number_type() is number_type::unsigned_integer if we have + * an integer in [9223372036854775808,18446744073709551616) + * You can recover the value by calling number.get_uint64() and you + * have that number.is_uint64() is true. + * + * For integers that do not fit in 64 bits, the function returns BIGINT_ERROR error code. + * + * Otherwise, number.get_number_type() has value number_type::floating_point_number + * and we have a binary64 number. + * You can recover the value by calling number.get_double() and you + * have that number.is_double() is true. 
+ * + * You must check the type before accessing the value: it is an error + * to call "get_int64()" when number.get_number_type() is not + * number_type::signed_integer and when number.is_int64() is false. + * + * Performance note: this is designed with performance in mind. When + * calling 'get_number()', you scan the number string only once, determining + * efficiently the type and storing it in an efficient manner. + */ + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + /** + * Get the raw JSON for this token. + * + * The string_view will always point into the input buffer. + * + * The string_view will start at the beginning of the token, and include the entire token + * *as well as all spaces until the next token (or EOF).* This means, for example, that a + * string token always begins with a " and is always terminated by the final ", possibly + * followed by a number of spaces. + * + * The string_view is *not* null-terminated. However, if this is a scalar (string, number, + * boolean, or null), the character after the end of the string_view is guaranteed to be + * a non-space token. + * + * Tokens include: + * - { + * - [ + * - "a string (possibly with UTF-8 or backslashed characters like \\\")". + * - -1.2e-100 + * - true + * - false + * - null + * + * See also value::raw_json(). + */ + simdjson_inline std::string_view raw_json_token() noexcept; + + /** + * Get a string_view pointing at this value in the JSON document. + * If this element is an array or an object, it consumes the array or the object + * and returns a string_view instance corresponding to the + * array as represented in JSON. It points inside the original document. + * If this element is a scalar (string, number, Boolean, null), it returns what + * raw_json_token() would return. + */ + simdjson_inline simdjson_result raw_json() noexcept; + + /** + * Returns the current location in the document if in bounds. 
+ */ + simdjson_inline simdjson_result current_location() noexcept; + + /** + * Returns the current depth in the document if in bounds. + * + * E.g., + * 0 = finished with document + * 1 = document root value (could be [ or {, not yet known) + * 2 = , or } inside root array/object + * 3 = key or value inside root array/object. + */ + simdjson_inline int32_t current_depth() const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. + * + * Calling at_pointer() on non-document instances (e.g., arrays and objects) is not + * standardized (by RFC 6901). We provide some experimental support for JSON pointers + * on non-document instances. Yet it is not the case when calling at_pointer on an array + * or an object instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. 
Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + */ + simdjson_inline simdjson_result at_path(std::string_view at_path) noexcept; + + +protected: + /** + * Create a value. + */ + simdjson_inline value(const value_iterator &iter) noexcept; + + /** + * Skip this value, allowing iteration to continue. + */ + simdjson_inline void skip() noexcept; + + /** + * Start a value at the current position. + * + * (It should already be started; this is just a self-documentation method.) + */ + static simdjson_inline value start(const value_iterator &iter) noexcept; + + /** + * Resume a value. 
+ */ + static simdjson_inline value resume(const value_iterator &iter) noexcept; + + /** + * Get the object, starting or resuming it as necessary + */ + simdjson_inline simdjson_result start_or_resume_object() noexcept; + + // simdjson_inline void log_value(const char *type) const noexcept; + // simdjson_inline void log_error(const char *message) const noexcept; + + value_iterator iter{}; + + friend class document; + friend class array_iterator; + friend class field; + friend class object; + friend struct simdjson_result; + friend struct simdjson_result; + friend class field; +}; + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::value &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result get_array() noexcept; + simdjson_inline simdjson_result get_object() noexcept; + + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_inline simdjson_result get_string(bool allow_replacement = false) noexcept; + template + simdjson_inline error_code get_string(string_type& receiver, bool allow_replacement = false) noexcept; + simdjson_inline simdjson_result get_wobbly_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result is_null() noexcept; + + 
template simdjson_inline simdjson_result get() noexcept; + + template simdjson_inline error_code get(T &out) noexcept; + +#if SIMDJSON_EXCEPTIONS + template + explicit simdjson_inline operator T() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::array() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::object() noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. 
+ */ + simdjson_inline simdjson_result find_field(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field as not there when they are not in order). + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. 
+ */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field_unordered(const char *key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](const char *key) noexcept; + + /** + * Get the type of this JSON value. + * + * NOTE: If you're only expecting a value to be one type (a typical case), it's generally + * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just + * let it throw an exception). + */ + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result is_string() noexcept; + simdjson_inline simdjson_result is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + + /** @copydoc simdjson_inline std::string_view value::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + simdjson_inline simdjson_result raw_json() noexcept; + + /** @copydoc simdjson_inline simdjson_result current_location() noexcept */ + simdjson_inline simdjson_result current_location() noexcept; + /** @copydoc simdjson_inline int32_t current_depth() const noexcept */ + simdjson_inline simdjson_result current_depth() const noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + simdjson_inline simdjson_result at_path(std::string_view json_path) noexcept; +}; + +} // namespace simdjson + +#endif 
// SIMDJSON_GENERIC_ONDEMAND_VALUE_H diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator-inl.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator-inl.h new file mode 100644 index 000000000000..e67bddaeb14c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator-inl.h @@ -0,0 +1,1091 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_INL_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_INL_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/atomparsing.h" +#include "simdjson/generic/numberparsing.h" +#include "simdjson/generic/ondemand/json_iterator.h" +#include "simdjson/generic/ondemand/value_iterator.h" +#include "simdjson/generic/ondemand/json_type-inl.h" +#include "simdjson/generic/ondemand/raw_json_string-inl.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline value_iterator::value_iterator( + json_iterator *json_iter, + depth_t depth, + token_position start_position +) noexcept : _json_iter{json_iter}, _depth{depth}, _start_position{start_position} +{ +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_object() noexcept { + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); + return started_object(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_root_object() noexcept { + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); + return started_root_object(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_object() noexcept { + assert_at_container_start(); +#if SIMDJSON_DEVELOPMENT_CHECKS + _json_iter->set_start_position(_depth, start_position()); +#endif + if (*_json_iter->peek() == '}') { + logger::log_value(*_json_iter, "empty object"); + _json_iter->return_current_and_advance(); + 
end_container(); + return false; + } + return true; +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_object() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. + // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if ( ! _json_iter->streaming() ) { + // The following lines do not fully protect against garbage content within the + // object: e.g., `{"a":2} foo }`. Users concerned with garbage content should + // call `at_end()` on the document instance at the end of the processing to + // ensure that the processing has finished at the end. + // + if (*_json_iter->peek_last() != '}') { + _json_iter->abandon(); + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing } at end"); + } + // If the last character is } *and* the first gibberish character is also '}' + // then on-demand could accidentally go over. So we need additional checks. + // https://github.com/simdjson/simdjson/issues/1834 + // Checking that the document is balanced requires a full scan which is potentially + // expensive, but it only happens in edge cases where the first padding character is + // a closing bracket. + if ((*_json_iter->peek(_json_iter->end_position()) == '}') && (!_json_iter->balanced())) { + _json_iter->abandon(); + // The exact error would require more work. It will typically be an unclosed object. 
+ return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "the document is unbalanced"); + } + } + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_root_object() noexcept { + auto error = check_root_object(); + if(error) { return error; } + return started_object(); +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::end_container() noexcept { +#if SIMDJSON_CHECK_EOF + if (depth() > 1 && at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing parent ] or }"); } + // if (depth() <= 1 && !at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF + _json_iter->ascend_to(depth()-1); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::has_next_field() noexcept { + assert_at_next(); + + // It's illegal to call this unless there are more tokens: anything that ends in } or ] is + // obligated to verify there are more tokens if they are not the top level. + switch (*_json_iter->return_current_and_advance()) { + case '}': + logger::log_end_value(*_json_iter, "object"); + SIMDJSON_TRY( end_container() ); + return false; + case ',': + return true; + default: + return report_error(TAPE_ERROR, "Missing comma between object fields"); + } +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::find_field_raw(const std::string_view key) noexcept { + error_code error; + bool has_value; + // + // Initially, the object can be in one of a few different places: + // + // 1. The start of the object, at the first field: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2, index 1) + // ``` + if (at_first_field()) { + has_value = true; + + // + // 2. 
When a previous search did not yield a value or the object is empty: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // { } + // ^ (depth 0, index 2) + // ``` + // + } else if (!is_open()) { +#if SIMDJSON_DEVELOPMENT_CHECKS + // If we're past the end of the object, we're being iterated out of order. + // Note: this is not perfect detection. It's possible the user is inside some other object; if so, + // this object iterator will blithely scan that object for fields. + if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } +#endif + return false; + + // 3. When a previous search found a field or an iterator yielded a value: + // + // ``` + // // When a field was not fully consumed (or not even touched at all) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2) + // // When a field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // When the last field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // ``` + // + } else { + if ((error = skip_child() )) { abandon(); return error; } + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } +#if SIMDJSON_DEVELOPMENT_CHECKS + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } +#endif + } + while (has_value) { + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). + if ((error = field_key().get(actual_key) )) { abandon(); return error; }; + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. 
+ if ((error = field_value() )) { abandon(); return error; } + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). + //if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). + SIMDJSON_TRY( skip_child() ); // Skip the value entirely + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } + } + + // If the loop ended, we're out of fields to look at. 
+ return false; +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::find_field_unordered_raw(const std::string_view key) noexcept { + /** + * When find_field_unordered_raw is called, we can either be pointing at the + * first key, pointing outside (at the closing brace) or if a key was matched + * we can be either pointing right afterthe ':' right before the value (that we need skip), + * or we may have consumed the value and we might be at a comma or at the + * final brace (ready for a call to has_next_field()). + */ + error_code error; + bool has_value; + + // First, we scan from that point to the end. + // If we don't find a match, we may loop back around, and scan from the beginning to that point. + token_position search_start = _json_iter->position(); + + // We want to know whether we need to go back to the beginning. + bool at_first = at_first_field(); + /////////////// + // Initially, the object can be in one of a few different places: + // + // 1. At the first key: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2, index 1) + // ``` + // + if (at_first) { + has_value = true; + + // 2. When a previous search did not yield a value or the object is empty: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // { } + // ^ (depth 0, index 2) + // ``` + // + } else if (!is_open()) { + +#if SIMDJSON_DEVELOPMENT_CHECKS + // If we're past the end of the object, we're being iterated out of order. + // Note: this is not perfect detection. It's possible the user is inside some other object; if so, + // this object iterator will blithely scan that object for fields. + if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } +#endif + SIMDJSON_TRY(reset_object().get(has_value)); + at_first = true; + // 3. 
When a previous search found a field or an iterator yielded a value: + // + // ``` + // // When a field was not fully consumed (or not even touched at all) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2) + // // When a field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // When the last field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // ``` + // + } else { + // If someone queried a key but they not did access the value, then we are left pointing + // at the ':' and we need to move forward through the value... If the value was + // processed then skip_child() does not move the iterator (but may adjust the depth). + if ((error = skip_child() )) { abandon(); return error; } + search_start = _json_iter->position(); + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } +#if SIMDJSON_DEVELOPMENT_CHECKS + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } +#endif + } + + // After initial processing, we will be in one of two states: + // + // ``` + // // At the beginning of a field + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // At the end of the object + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // ``` + // + // Next, we find a match starting from the current position. + while (has_value) { + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field + + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). 
+ if ((error = field_key().get(actual_key) )) { abandon(); return error; }; + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. + if ((error = field_value() )) { abandon(); return error; } + + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). + // if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). + SIMDJSON_TRY( skip_child() ); + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } + } + // Performance note: it maybe wasteful to rewind to the beginning when there might be + // no other query following. Indeed, it would require reskipping the whole object. + // Instead, you can just stay where you are. If there is a new query, there is always time + // to rewind. + if(at_first) { return false; } + + // If we reach the end without finding a match, search the rest of the fields starting at the + // beginning of the object. 
+ // (We have already run through the object before, so we've already validated its structure. We + // don't check errors in this bit.) + SIMDJSON_TRY(reset_object().get(has_value)); + while (true) { + SIMDJSON_ASSUME(has_value); // we should reach search_start before ever reaching the end of the object + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field + + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). + error = field_key().get(actual_key); SIMDJSON_ASSUME(!error); + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. + error = field_value(); SIMDJSON_ASSUME(!error); + + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). + // if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). 
+ SIMDJSON_TRY( skip_child() ); + // If we reached the end of the key-value pair we started from, then we know + // that the key is not there so we return false. We are either right before + // the next comma or the final brace. + if(_json_iter->position() == search_start) { return false; } + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + error = has_next_field().get(has_value); SIMDJSON_ASSUME(!error); + // If we make the mistake of exiting here, then we could be left pointing at a key + // in the middle of an object. That's not an allowable state. + } + // If the loop ended, we're out of fields to look at. The program should + // never reach this point. + return false; +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::field_key() noexcept { + assert_at_next(); + + const uint8_t *key = _json_iter->return_current_and_advance(); + if (*(key++) != '"') { return report_error(TAPE_ERROR, "Object key is not a string"); } + return raw_json_string(key); +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::field_value() noexcept { + assert_at_next(); + + if (*_json_iter->return_current_and_advance() != ':') { return report_error(TAPE_ERROR, "Missing colon in object field"); } + _json_iter->descend_to(depth()+1); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_array() noexcept { + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); + return started_array(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_root_array() noexcept { + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); + return started_root_array(); +} + +inline std::string value_iterator::to_string() const noexcept { + auto answer = 
std::string("value_iterator [ depth : ") + std::to_string(_depth) + std::string(", "); + if(_json_iter != nullptr) { answer += _json_iter->to_string(); } + answer += std::string(" ]"); + return answer; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_array() noexcept { + assert_at_container_start(); + if (*_json_iter->peek() == ']') { + logger::log_value(*_json_iter, "empty array"); + _json_iter->return_current_and_advance(); + SIMDJSON_TRY( end_container() ); + return false; + } + _json_iter->descend_to(depth()+1); +#if SIMDJSON_DEVELOPMENT_CHECKS + _json_iter->set_start_position(_depth, start_position()); +#endif + return true; +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_array() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. + // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if ( ! _json_iter->streaming() ) { + // The following lines do not fully protect against garbage content within the + // array: e.g., `[1, 2] foo]`. Users concerned with garbage content should + // also call `at_end()` on the document instance at the end of the processing to + // ensure that the processing has finished at the end. + // + if (*_json_iter->peek_last() != ']') { + _json_iter->abandon(); + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing ] at end"); + } + // If the last character is ] *and* the first gibberish character is also ']' + // then on-demand could accidentally go over. So we need additional checks. + // https://github.com/simdjson/simdjson/issues/1834 + // Checking that the document is balanced requires a full scan which is potentially + // expensive, but it only happens in edge cases where the first padding character is + // a closing bracket. 
+ if ((*_json_iter->peek(_json_iter->end_position()) == ']') && (!_json_iter->balanced())) { + _json_iter->abandon(); + // The exact error would require more work. It will typically be an unclosed array. + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "the document is unbalanced"); + } + } + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_root_array() noexcept { + auto error = check_root_array(); + if (error) { return error; } + return started_array(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::has_next_element() noexcept { + assert_at_next(); + + logger::log_event(*this, "has_next_element"); + switch (*_json_iter->return_current_and_advance()) { + case ']': + logger::log_end_value(*_json_iter, "array"); + SIMDJSON_TRY( end_container() ); + return false; + case ',': + _json_iter->descend_to(depth()+1); + return true; + default: + return report_error(TAPE_ERROR, "Missing comma between array elements"); + } +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::parse_bool(const uint8_t *json) const noexcept { + auto not_true = atomparsing::str4ncmp(json, "true"); + auto not_false = atomparsing::str4ncmp(json, "fals") | (json[4] ^ 'e'); + bool error = (not_true && not_false) || jsoncharutils::is_not_structural_or_whitespace(json[not_true ? 
5 : 4]); + if (error) { return incorrect_type_error("Not a boolean"); } + return simdjson_result(!not_true); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::parse_null(const uint8_t *json) const noexcept { + bool is_null_string = !atomparsing::str4ncmp(json, "null") && jsoncharutils::is_structural_or_whitespace(json[4]); + // if we start with 'n', we must be a null + if(!is_null_string && json[0]=='n') { return incorrect_type_error("Not a null but starts with n"); } + return is_null_string; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_string(bool allow_replacement) noexcept { + return get_raw_json_string().unescape(json_iter(), allow_replacement); +} +template +simdjson_warn_unused simdjson_inline error_code value_iterator::get_string(string_type& receiver, bool allow_replacement) noexcept { + std::string_view content; + auto err = get_string(allow_replacement).get(content); + if (err) { return err; } + receiver = content; + return SUCCESS; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_wobbly_string() noexcept { + return get_raw_json_string().unescape_wobbly(json_iter()); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_raw_json_string() noexcept { + auto json = peek_scalar("string"); + if (*json != '"') { return incorrect_type_error("Not a string"); } + advance_scalar("string"); + return raw_json_string(json+1); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_uint64() noexcept { + auto result = numberparsing::parse_unsigned(peek_non_root_scalar("uint64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("uint64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_uint64_in_string() noexcept { + auto result = numberparsing::parse_unsigned_in_string(peek_non_root_scalar("uint64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("uint64"); } + return 
result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_int64() noexcept { + auto result = numberparsing::parse_integer(peek_non_root_scalar("int64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_int64_in_string() noexcept { + auto result = numberparsing::parse_integer_in_string(peek_non_root_scalar("int64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_double() noexcept { + auto result = numberparsing::parse_double(peek_non_root_scalar("double")); + if(result.error() == SUCCESS) { advance_non_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_double_in_string() noexcept { + auto result = numberparsing::parse_double_in_string(peek_non_root_scalar("double")); + if(result.error() == SUCCESS) { advance_non_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_bool() noexcept { + auto result = parse_bool(peek_non_root_scalar("bool")); + if(result.error() == SUCCESS) { advance_non_root_scalar("bool"); } + return result; +} +simdjson_inline simdjson_result value_iterator::is_null() noexcept { + bool is_null_value; + SIMDJSON_TRY(parse_null(peek_non_root_scalar("null")).get(is_null_value)); + if(is_null_value) { advance_non_root_scalar("null"); } + return is_null_value; +} +simdjson_inline bool value_iterator::is_negative() noexcept { + return numberparsing::is_negative(peek_non_root_scalar("numbersign")); +} +simdjson_inline bool value_iterator::is_root_negative() noexcept { + return numberparsing::is_negative(peek_root_scalar("numbersign")); +} +simdjson_inline simdjson_result value_iterator::is_integer() noexcept { + return 
numberparsing::is_integer(peek_non_root_scalar("integer")); +} +simdjson_inline simdjson_result value_iterator::get_number_type() noexcept { + return numberparsing::get_number_type(peek_non_root_scalar("integer")); +} +simdjson_inline simdjson_result value_iterator::get_number() noexcept { + number num; + error_code error = numberparsing::parse_number(peek_non_root_scalar("number"), num); + if(error) { return error; } + return num; +} + +simdjson_inline simdjson_result value_iterator::is_root_integer(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("is_root_integer"); + uint8_t tmpbuf[20+1+1]{}; // <20 digits> is the longest possible unsigned integer + tmpbuf[20+1] = '\0'; // make sure that buffer is always null terminated. + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 20+1)) { + return false; // if there are more than 20 characters, it cannot be represented as an integer. + } + auto answer = numberparsing::is_integer(tmpbuf); + // If the parsing was a success, we must still check that it is + // a single scalar. Note that we parse first because of cases like '[]' where + // getting TRAILING_CONTENT is wrong. + if(check_trailing && (answer.error() == SUCCESS) && (!_json_iter->is_single_token())) { return TRAILING_CONTENT; } + return answer; +} + +simdjson_inline simdjson_result value_iterator::get_root_number_type(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("number"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1+1]; + tmpbuf[1074+8+1] = '\0'; // make sure that buffer is always null terminated. 
+ if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 1074+8+1)) { + if(numberparsing::check_if_integer(json, max_len)) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + logger::log_error(*_json_iter, start_position(), depth(), "Found big integer"); + return number_type::big_integer; + } + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters and not a big integer"); + return NUMBER_ERROR; + } + auto answer = numberparsing::get_number_type(tmpbuf); + if (check_trailing && (answer.error() == SUCCESS) && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + return answer; +} +simdjson_inline simdjson_result value_iterator::get_root_number(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("number"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + // NOTE: the current approach doesn't work for very big integer numbers containing more than 1074 digits. + uint8_t tmpbuf[1074+8+1+1]; + tmpbuf[1074+8+1] = '\0'; // make sure that buffer is always null terminated. 
+ if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 1074+8+1)) { + if(numberparsing::check_if_integer(json, max_len)) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + logger::log_error(*_json_iter, start_position(), depth(), "Found big integer"); + return BIGINT_ERROR; + } + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters and not a big integer"); + return NUMBER_ERROR; + } + number num; + error_code error = numberparsing::parse_number(tmpbuf, num); + if(error) { return error; } + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("number"); + return num; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_string(bool check_trailing, bool allow_replacement) noexcept { + return get_root_raw_json_string(check_trailing).unescape(json_iter(), allow_replacement); +} +template +simdjson_warn_unused simdjson_inline error_code value_iterator::get_root_string(string_type& receiver, bool check_trailing, bool allow_replacement) noexcept { + std::string_view content; + auto err = get_root_string(check_trailing, allow_replacement).get(content); + if (err) { return err; } + receiver = content; + return SUCCESS; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_wobbly_string(bool check_trailing) noexcept { + return get_root_raw_json_string(check_trailing).unescape_wobbly(json_iter()); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_raw_json_string(bool check_trailing) noexcept { + auto json = peek_scalar("string"); + if (*json != '"') { return incorrect_type_error("Not a string"); } + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_scalar("string"); + return raw_json_string(json+1); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_uint64(bool check_trailing) 
noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("uint64"); + uint8_t tmpbuf[20+1+1]{}; // <20 digits> is the longest possible unsigned integer + tmpbuf[20+1] = '\0'; // make sure that buffer is always null terminated. + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 20+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("uint64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_uint64_in_string(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("uint64"); + uint8_t tmpbuf[20+1+1]{}; // <20 digits> is the longest possible unsigned integer + tmpbuf[20+1] = '\0'; // make sure that buffer is always null terminated. + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 20+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("uint64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_int64(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("int64"); + uint8_t tmpbuf[20+1+1]; // -<19 digits> is the longest possible integer + tmpbuf[20+1] = '\0'; // make sure that buffer is always null terminated. 
+ if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 20+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("int64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_int64_in_string(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("int64"); + uint8_t tmpbuf[20+1+1]; // -<19 digits> is the longest possible integer + tmpbuf[20+1] = '\0'; // make sure that buffer is always null terminated. + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 20+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("int64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_double(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1+1]; // +1 for null termination. + tmpbuf[1074+8+1] = '\0'; // make sure that buffer is always null terminated. 
+ if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 1074+8+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("double"); + } + return result; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_double_in_string(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1+1]; // +1 for null termination. + tmpbuf[1074+8+1] = '\0'; // make sure that buffer is always null terminated. + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 1074+8+1)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("double"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_bool(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("bool"); + uint8_t tmpbuf[5+1+1]; // +1 for null termination + tmpbuf[5+1] = '\0'; // make sure that buffer is always null terminated. 
+ if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf, 5+1)) { return incorrect_type_error("Not a boolean"); } + auto result = parse_bool(tmpbuf); + if(result.error() == SUCCESS) { + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("bool"); + } + return result; +} +simdjson_inline simdjson_result value_iterator::is_root_null(bool check_trailing) noexcept { + auto max_len = peek_root_length(); + auto json = peek_root_scalar("null"); + bool result = (max_len >= 4 && !atomparsing::str4ncmp(json, "null") && + (max_len == 4 || jsoncharutils::is_structural_or_whitespace(json[4]))); + if(result) { // we have something that looks like a null. + if (check_trailing && !_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("null"); + } + return result; +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::skip_child() noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth >= _depth ); + + return _json_iter->skip_child(depth()); +} + +simdjson_inline value_iterator value_iterator::child() const noexcept { + assert_at_child(); + return { _json_iter, depth()+1, _json_iter->token.position() }; +} + +// GCC 7 warns when the first line of this function is inlined away into oblivion due to the caller +// relating depth and iterator depth, which is a desired effect. It does not happen if is_open is +// marked non-inline. 
+SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline bool value_iterator::is_open() const noexcept { + return _json_iter->depth() >= depth(); +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline bool value_iterator::at_end() const noexcept { + return _json_iter->at_end(); +} + +simdjson_inline bool value_iterator::at_start() const noexcept { + return _json_iter->token.position() == start_position(); +} + +simdjson_inline bool value_iterator::at_first_field() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + return _json_iter->token.position() == start_position() + 1; +} + +simdjson_inline void value_iterator::abandon() noexcept { + _json_iter->abandon(); +} + +simdjson_warn_unused simdjson_inline depth_t value_iterator::depth() const noexcept { + return _depth; +} +simdjson_warn_unused simdjson_inline error_code value_iterator::error() const noexcept { + return _json_iter->error; +} +simdjson_warn_unused simdjson_inline uint8_t *&value_iterator::string_buf_loc() noexcept { + return _json_iter->string_buf_loc(); +} +simdjson_warn_unused simdjson_inline const json_iterator &value_iterator::json_iter() const noexcept { + return *_json_iter; +} +simdjson_warn_unused simdjson_inline json_iterator &value_iterator::json_iter() noexcept { + return *_json_iter; +} + +simdjson_inline const uint8_t *value_iterator::peek_start() const noexcept { + return _json_iter->peek(start_position()); +} +simdjson_inline uint32_t value_iterator::peek_start_length() const noexcept { + return _json_iter->peek_length(start_position()); +} +simdjson_inline uint32_t value_iterator::peek_root_length() const noexcept { + return _json_iter->peek_root_length(start_position()); +} + +simdjson_inline const uint8_t *value_iterator::peek_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. 
+ if (!is_at_start()) { return peek_start(); } + + // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. + assert_at_start(); + return _json_iter->peek(); +} + +simdjson_inline void value_iterator::advance_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + if (!is_at_start()) { return; } + + // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. + assert_at_start(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} + +simdjson_inline error_code value_iterator::start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept { + logger::log_start_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + const uint8_t *json; + if (!is_at_start()) { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + json = peek_start(); + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + } else { + assert_at_start(); + /** + * We should be prudent. Let us peek. If it is not the right type, we + * return an error. Only once we have determined that we have the right + * type are we allowed to advance! 
+ */ + json = _json_iter->peek(); + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + _json_iter->return_current_and_advance(); + } + + + return SUCCESS; +} + + +simdjson_inline const uint8_t *value_iterator::peek_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return peek_start(); } + + assert_at_root(); + return _json_iter->peek(); +} +simdjson_inline const uint8_t *value_iterator::peek_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return peek_start(); } + + assert_at_non_root_start(); + return _json_iter->peek(); +} + +simdjson_inline void value_iterator::advance_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_root(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} +simdjson_inline void value_iterator::advance_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_non_root_start(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} + +simdjson_inline error_code value_iterator::incorrect_type_error(const char *message) const noexcept { + logger::log_error(*_json_iter, start_position(), depth(), message); + return INCORRECT_TYPE; +} + +simdjson_inline bool value_iterator::is_at_start() const noexcept { + return position() == start_position(); +} + +simdjson_inline bool value_iterator::is_at_key() const noexcept { + // Keys are at the same depth as the object. + // Note here that we could be safer and check that we are within an object, + // but we do not. 
+ return _depth == _json_iter->_depth && *_json_iter->peek() == '"'; +} + +simdjson_inline bool value_iterator::is_at_iterator_start() const noexcept { + // We can legitimately be either at the first value ([1]), or after the array if it's empty ([]). + auto delta = position() - start_position(); + return delta == 1 || delta == 2; +} + +inline void value_iterator::assert_at_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_container_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position + 1 ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_next() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +simdjson_inline void value_iterator::move_at_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position); +} + +simdjson_inline void value_iterator::move_at_container_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position + 1); +} + +simdjson_inline simdjson_result value_iterator::reset_array() noexcept { + if(error()) { return error(); } + move_at_container_start(); + return started_array(); +} + +simdjson_inline simdjson_result value_iterator::reset_object() noexcept { + if(error()) { return error(); } + move_at_container_start(); + return started_object(); +} + +inline void value_iterator::assert_at_child() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth + 1 ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_root() const noexcept { + assert_at_start(); + SIMDJSON_ASSUME( 
_depth == 1 ); +} + +inline void value_iterator::assert_at_non_root_start() const noexcept { + assert_at_start(); + SIMDJSON_ASSUME( _depth > 1 ); +} + +inline void value_iterator::assert_is_valid() const noexcept { + SIMDJSON_ASSUME( _json_iter != nullptr ); +} + +simdjson_inline bool value_iterator::is_valid() const noexcept { + return _json_iter != nullptr; +} + +simdjson_inline simdjson_result value_iterator::type() const noexcept { + switch (*peek_start()) { + case '{': + return json_type::object; + case '[': + return json_type::array; + case '"': + return json_type::string; + case 'n': + return json_type::null; + case 't': case 'f': + return json_type::boolean; + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return json_type::number; + default: + return TAPE_ERROR; + } +} + +simdjson_inline token_position value_iterator::start_position() const noexcept { + return _start_position; +} + +simdjson_inline token_position value_iterator::position() const noexcept { + return _json_iter->position(); +} + +simdjson_inline token_position value_iterator::end_position() const noexcept { + return _json_iter->end_position(); +} + +simdjson_inline token_position value_iterator::last_position() const noexcept { + return _json_iter->last_position(); +} + +simdjson_inline error_code value_iterator::report_error(error_code error, const char *message) noexcept { + return _json_iter->report_error(error, message); +} + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::value_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson + +#endif // 
SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_INL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator.h b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator.h new file mode 100644 index 000000000000..a01a8fb09e19 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/generic/ondemand/value_iterator.h @@ -0,0 +1,492 @@ +#ifndef SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_H +#include "simdjson/generic/ondemand/base.h" +#include "simdjson/generic/implementation_simdjson_result_base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace ondemand { + +/** + * Iterates through a single JSON value at a particular depth. + * + * Does not keep track of the type of value: provides methods for objects, arrays and scalars and expects + * the caller to call the right ones. + * + * @private This is not intended for external use. + */ +class value_iterator { +protected: + /** The underlying JSON iterator */ + json_iterator *_json_iter{}; + /** The depth of this value */ + depth_t _depth{}; + /** + * The starting token index for this value + */ + token_position _start_position{}; + +public: + simdjson_inline value_iterator() noexcept = default; + + /** + * Denote that we're starting a document. + */ + simdjson_inline void start_document() noexcept; + + /** + * Skips a non-iterated or partially-iterated JSON value, whether it is a scalar, array or object. + * + * Optimized for scalars. 
+ */ + simdjson_warn_unused simdjson_inline error_code skip_child() noexcept; + + /** + * Tell whether the iterator is at the EOF mark + */ + simdjson_inline bool at_end() const noexcept; + + /** + * Tell whether the iterator is at the start of the value + */ + simdjson_inline bool at_start() const noexcept; + + /** + * Tell whether the value is open--if the value has not been used, or the array/object is still open. + */ + simdjson_inline bool is_open() const noexcept; + + /** + * Tell whether the value is at an object's first field (just after the {). + */ + simdjson_inline bool at_first_field() const noexcept; + + /** + * Abandon all iteration. + */ + simdjson_inline void abandon() noexcept; + + /** + * Get the child value as a value_iterator. + */ + simdjson_inline value_iterator child_value() const noexcept; + + /** + * Get the depth of this value. + */ + simdjson_inline int32_t depth() const noexcept; + + /** + * Get the JSON type of this value. + * + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result type() const noexcept; + + /** + * @addtogroup object Object iteration + * + * Methods to iterate and find object fields. These methods generally *assume* the value is + * actually an object; the caller is responsible for keeping track of that fact. + * + * @{ + */ + + /** + * Start an object iteration. + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCORRECT_TYPE if there is no opening { + */ + simdjson_warn_unused simdjson_inline simdjson_result start_object() noexcept; + /** + * Start an object iteration from the root. + * + * @returns Whether the object had any fields (returns false for empty). 
+ * @error INCORRECT_TYPE if there is no opening { + * @error TAPE_ERROR if there is no matching } at end of document + */ + simdjson_warn_unused simdjson_inline simdjson_result start_root_object() noexcept; + /** + * Checks whether an object could be started from the root. May be called by start_root_object. + * + * @returns SUCCESS if it is possible to safely start an object from the root (document level). + * @error INCORRECT_TYPE if there is no opening { + * @error TAPE_ERROR if there is no matching } at end of document + */ + simdjson_warn_unused simdjson_inline error_code check_root_object() noexcept; + /** + * Start an object iteration after the user has already checked and moved past the {. + * + * Does not move the iterator unless the object is empty ({}). + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_object() noexcept; + /** + * Start an object iteration from the root, after the user has already checked and moved past the {. + * + * Does not move the iterator unless the object is empty ({}). + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_root_object() noexcept; + + /** + * Moves to the next field in an object. + * + * Looks for , and }. If } is found, the object is finished and the iterator advances past it. + * Otherwise, it advances to the next value. + * + * @return whether there is another field in the object. + * @error TAPE_ERROR If there is a comma missing between fields. + * @error TAPE_ERROR If there is a comma, but not enough tokens remaining to have a key, :, and value. 
+ */ + simdjson_warn_unused simdjson_inline simdjson_result has_next_field() noexcept; + + /** + * Get the current field's key. + */ + simdjson_warn_unused simdjson_inline simdjson_result field_key() noexcept; + + /** + * Pass the : in the field and move to its value. + */ + simdjson_warn_unused simdjson_inline error_code field_value() noexcept; + + /** + * Find the next field with the given key. + * + * Assumes you have called next_field() or otherwise matched the previous value. + * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline error_code find_field(const std::string_view key) noexcept; + + /** + * Find the next field with the given key, *without* unescaping. This assumes object order: it + * will not find the field if it was already passed when looking for some *other* field. + * + * Assumes you have called next_field() or otherwise matched the previous value. + * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline simdjson_result find_field_raw(const std::string_view key) noexcept; + + /** + * Find the field with the given key without regard to order, and *without* unescaping. + * + * This is an unordered object lookup: if the field is not found initially, it will cycle around and scan from the beginning. 
+ * + * Assumes you have called next_field() or otherwise matched the previous value. + * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline simdjson_result find_field_unordered_raw(const std::string_view key) noexcept; + + /** @} */ + + /** + * @addtogroup array Array iteration + * Methods to iterate over array elements. These methods generally *assume* the value is actually + * an object; the caller is responsible for keeping track of that fact. + * @{ + */ + + /** + * Check for an opening [ and start an array iteration. + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCORRECT_TYPE If there is no [. + */ + simdjson_warn_unused simdjson_inline simdjson_result start_array() noexcept; + /** + * Check for an opening [ and start an array iteration while at the root. + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCORRECT_TYPE If there is no [. + * @error TAPE_ERROR if there is no matching ] at end of document + */ + simdjson_warn_unused simdjson_inline simdjson_result start_root_array() noexcept; + /** + * Checks whether an array could be started from the root. May be called by start_root_array. + * + * @returns SUCCESS if it is possible to safely start an array from the root (document level). + * @error INCORRECT_TYPE If there is no [. + * @error TAPE_ERROR if there is no matching ] at end of document + */ + simdjson_warn_unused simdjson_inline error_code check_root_array() noexcept; + /** + * Start an array iteration, after the user has already checked and moved past the [. 
+ * + * Does not move the iterator unless the array is empty ([]). + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_array() noexcept; + /** + * Start an array iteration from the root, after the user has already checked and moved past the [. + * + * Does not move the iterator unless the array is empty ([]). + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_root_array() noexcept; + + /** + * Moves to the next element in an array. + * + * Looks for , and ]. If ] is found, the array is finished and the iterator advances past it. + * Otherwise, it advances to the next value. + * + * @return Whether there is another element in the array. + * @error TAPE_ERROR If there is a comma missing between elements. + */ + simdjson_warn_unused simdjson_inline simdjson_result has_next_element() noexcept; + + /** + * Get a child value iterator. 
+ */ + simdjson_warn_unused simdjson_inline value_iterator child() const noexcept; + + /** @} */ + + /** + * @defgroup scalar Scalar values + * @addtogroup scalar + * @{ + */ + + simdjson_warn_unused simdjson_inline simdjson_result get_string(bool allow_replacement) noexcept; + template + simdjson_warn_unused simdjson_inline error_code get_string(string_type& receiver, bool allow_replacement) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_wobbly_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_int64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_double() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_bool() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_null() noexcept; + simdjson_warn_unused simdjson_inline bool is_negative() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_integer() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + simdjson_warn_unused simdjson_inline simdjson_result get_root_string(bool check_trailing, bool allow_replacement) noexcept; + template + simdjson_warn_unused simdjson_inline error_code get_root_string(string_type& receiver, bool check_trailing, bool allow_replacement) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_wobbly_string(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result 
get_root_raw_json_string(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_uint64(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_uint64_in_string(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_int64(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_int64_in_string(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_double(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_double_in_string(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_bool(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline bool is_root_negative() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_root_integer(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_number_type(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_number(bool check_trailing) noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_root_null(bool check_trailing) noexcept; + + simdjson_inline error_code error() const noexcept; + simdjson_inline uint8_t *&string_buf_loc() noexcept; + simdjson_inline const json_iterator &json_iter() const noexcept; + simdjson_inline json_iterator &json_iter() noexcept; + + simdjson_inline void assert_is_valid() const noexcept; + simdjson_inline bool is_valid() const noexcept; + + /** @} */ +protected: + /** + * Restarts an array iteration. + * @returns Whether the array has any elements (returns false for empty). + */ + simdjson_inline simdjson_result reset_array() noexcept; + /** + * Restarts an object iteration. + * @returns Whether the object has any fields (returns false for empty). 
+ */ + simdjson_inline simdjson_result reset_object() noexcept; + /** + * move_at_start(): moves us so that we are pointing at the beginning of + * the container. It updates the index so that at_start() is true and it + * syncs the depth. The user can then create a new container instance. + * + * Usage: used with value::count_elements(). + **/ + simdjson_inline void move_at_start() noexcept; + + /** + * move_at_container_start(): moves us so that we are pointing at the beginning of + * the container so that assert_at_container_start() passes. + * + * Usage: used with reset_array() and reset_object(). + **/ + simdjson_inline void move_at_container_start() noexcept; + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; + simdjson_inline value_iterator(json_iterator *json_iter, depth_t depth, token_position start_index) noexcept; + + simdjson_inline simdjson_result parse_null(const uint8_t *json) const noexcept; + simdjson_inline simdjson_result parse_bool(const uint8_t *json) const noexcept; + simdjson_inline const uint8_t *peek_start() const noexcept; + simdjson_inline uint32_t peek_start_length() const noexcept; + simdjson_inline uint32_t peek_root_length() const noexcept; + + /** + * The general idea of the advance_... methods and the peek_* methods + * is that you first peek and check that you have desired type. If you do, + * and only if you do, then you advance. + * + * We used to unconditionally advance. But this made reasoning about our + * current state difficult. + * Suppose you always advance. Look at the 'value' matching the key + * "shadowable" in the following example... + * + * ({"globals":{"a":{"shadowable":[}}}}) + * + * If the user thinks it is a Boolean and asks for it, then we check the '[', + * decide it is not a Boolean, but still move into the next character ('}'). Now + * we are left pointing at '}' right after a '['. And we have not yet reported + * an error, only that we do not have a Boolean. 
+ * + * If, instead, you just stand your ground until it is content that you know, then + * you will only even move beyond the '[' if the user tells you that you have an + * array. So you will be at the '}' character inside the array and, hopefully, you + * will then catch the error because an array cannot start with '}', but the code + * processing Boolean values does not know this. + * + * So the contract is: first call 'peek_...' and then call 'advance_...' only + * if you have determined that it is a type you can handle. + * + * Unfortunately, it makes the code more verbose, longer and maybe more error prone. + */ + + simdjson_inline void advance_scalar(const char *type) noexcept; + simdjson_inline void advance_root_scalar(const char *type) noexcept; + simdjson_inline void advance_non_root_scalar(const char *type) noexcept; + + simdjson_inline const uint8_t *peek_scalar(const char *type) noexcept; + simdjson_inline const uint8_t *peek_root_scalar(const char *type) noexcept; + simdjson_inline const uint8_t *peek_non_root_scalar(const char *type) noexcept; + + + simdjson_inline error_code start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept; + simdjson_inline error_code end_container() noexcept; + + /** + * Advance to a place expecting a value (increasing depth). + * + * @return The current token (the one left behind). + * @error TAPE_ERROR If the document ended early. + */ + simdjson_inline simdjson_result advance_to_value() noexcept; + + simdjson_inline error_code incorrect_type_error(const char *message) const noexcept; + simdjson_inline error_code error_unless_more_tokens(uint32_t tokens=1) const noexcept; + + simdjson_inline bool is_at_start() const noexcept; + /** + * is_at_iterator_start() returns true on an array or object after it has just been + * created, whether the instance is empty or not. 
+ * + * Usage: used by array::begin() in debug mode (SIMDJSON_DEVELOPMENT_CHECKS) + */ + simdjson_inline bool is_at_iterator_start() const noexcept; + + /** + * Assuming that we are within an object, this returns true if we + * are pointing at a key. + * + * Usage: the skip_child() method should never be used while we are pointing + * at a key inside an object. + */ + simdjson_inline bool is_at_key() const noexcept; + + inline void assert_at_start() const noexcept; + inline void assert_at_container_start() const noexcept; + inline void assert_at_root() const noexcept; + inline void assert_at_child() const noexcept; + inline void assert_at_next() const noexcept; + inline void assert_at_non_root_start() const noexcept; + + /** Get the starting position of this value */ + simdjson_inline token_position start_position() const noexcept; + + /** @copydoc error_code json_iterator::position() const noexcept; */ + simdjson_inline token_position position() const noexcept; + /** @copydoc error_code json_iterator::end_position() const noexcept; */ + simdjson_inline token_position last_position() const noexcept; + /** @copydoc error_code json_iterator::end_position() const noexcept; */ + simdjson_inline token_position end_position() const noexcept; + /** @copydoc error_code json_iterator::report_error(error_code error, const char *message) noexcept; */ + simdjson_inline error_code report_error(error_code error, const char *message) noexcept; + + friend class document; + friend class object; + friend class array; + friend class value; + friend class field; +}; // value_iterator + +} // namespace ondemand +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::value_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) 
noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson + +#endif // SIMDJSON_GENERIC_ONDEMAND_VALUE_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/haswell.h b/contrib/libs/simdjson/include/simdjson/haswell.h new file mode 100644 index 000000000000..867b7a449115 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_HASWELL_H +#define SIMDJSON_HASWELL_H + +#include "simdjson/haswell/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/haswell/end.h" + +#endif // SIMDJSON_HASWELL_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/haswell/base.h b/contrib/libs/simdjson/include/simdjson/haswell/base.h new file mode 100644 index 000000000000..73cc3ef1b4a5 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/base.h @@ -0,0 +1,27 @@ +#ifndef SIMDJSON_HASWELL_BASE_H +#define SIMDJSON_HASWELL_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL +namespace simdjson { +/** + * Implementation for Haswell (Intel AVX2). 
+ */ +namespace haswell { + +class implementation; + +namespace { +namespace simd { +template struct simd8; +template struct simd8x64; +} // namespace simd +} // unnamed namespace + +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/begin.h b/contrib/libs/simdjson/include/simdjson/haswell/begin.h new file mode 100644 index 000000000000..1be8c192c57c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/begin.h @@ -0,0 +1,14 @@ +#define SIMDJSON_IMPLEMENTATION haswell + +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/intrinsics.h" + +#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL +SIMDJSON_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt") +#endif + +#include "simdjson/haswell/bitmanipulation.h" +#include "simdjson/haswell/bitmask.h" +#include "simdjson/haswell/numberparsing_defs.h" +#include "simdjson/haswell/simd.h" +#include "simdjson/haswell/stringparsing_defs.h" diff --git a/contrib/libs/simdjson/include/simdjson/haswell/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/haswell/bitmanipulation.h new file mode 100644 index 000000000000..9b0e59905c84 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/bitmanipulation.h @@ -0,0 +1,71 @@ +#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H +#define SIMDJSON_HASWELL_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/intrinsics.h" +#include "simdjson/haswell/bitmask.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace haswell { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#if SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows in this kernel + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/bitmask.h b/contrib/libs/simdjson/include/simdjson/haswell/bitmask.h new file mode 100644 index 000000000000..310c3280bec7 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/bitmask.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_HASWELL_BITMASK_H +#define SIMDJSON_HASWELL_BITMASK_H + +#ifndef 
SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace haswell { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_BITMASK_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/end.h b/contrib/libs/simdjson/include/simdjson/haswell/end.h new file mode 100644 index 000000000000..421df3653c94 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/end.h @@ -0,0 +1,9 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL +SIMDJSON_UNTARGET_REGION +#endif + +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/haswell/implementation.h b/contrib/libs/simdjson/include/simdjson/haswell/implementation.h new file mode 100644 index 000000000000..6861e42983c1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/implementation.h @@ -0,0 +1,36 @@ +#ifndef SIMDJSON_HASWELL_IMPLEMENTATION_H +#define SIMDJSON_HASWELL_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL +namespace simdjson { +namespace 
haswell { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "haswell", + "Intel/AMD AVX2", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/intrinsics.h b/contrib/libs/simdjson/include/simdjson/haswell/intrinsics.h new file mode 100644 index 000000000000..a4593dbb6987 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/intrinsics.h @@ -0,0 +1,52 @@ +#ifndef SIMDJSON_HASWELL_INTRINSICS_H +#define SIMDJSON_HASWELL_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + +#if SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... 
in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdjson, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDJSON_CLANG_VISUAL_STUDIO + +static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for haswell kernel."); + +#endif // SIMDJSON_HASWELL_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/haswell/numberparsing_defs.h new file mode 100644 index 000000000000..5673e5e4f814 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/numberparsing_defs.h @@ -0,0 +1,61 @@ +#ifndef SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H +#define SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace haswell { +namespace numberparsing { + +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so 
we are being wasteful. + const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +/** @private */ +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace haswell +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/ondemand.h b/contrib/libs/simdjson/include/simdjson/haswell/ondemand.h new file mode 100644 index 000000000000..b3aa993efff0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_HASWELL_ONDEMAND_H +#define SIMDJSON_HASWELL_ONDEMAND_H + +#include 
"simdjson/haswell/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/haswell/end.h" + +#endif // SIMDJSON_HASWELL_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/simd.h b/contrib/libs/simdjson/include/simdjson/haswell/simd.h new file mode 100644 index 000000000000..9c40f4f4fdd7 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/simd.h @@ -0,0 +1,372 @@ +#ifndef SIMDJSON_HASWELL_SIMD_H +#define SIMDJSON_HASWELL_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/intrinsics.h" +#include "simdjson/haswell/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace haswell { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + simdjson_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m256i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m256i&() const { return this->value; } + simdjson_inline operator __m256i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } 
+ simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m256i _value) : base>(_value) {} + + friend simdjson_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm256_cmpeq_epi8(lhs, rhs); } + + static const int SIZE = sizeof(base::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } + simdjson_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm256_setzero_si256(); } + static simdjson_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + 
v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
+ template + simdjson_inline void compress(uint32_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in four steps, first 8 bytes and then second 8 bytes... + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits + uint8_t mask3 = uint8_t(mask >> 16); // ... + uint8_t mask4 = uint8_t(mask >> 24); // ... + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3], + thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask and so forth + shufmask = + _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818, + 0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m256i pruned = _mm256_shuffle_epi8(*this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + int pop3 = BitsSetTable256mul2[mask3]; + + // then load the corresponding mask + // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic. + __m256i v256 = _mm256_castsi128_si256( + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop1 * 8))); + __m256i compactmask = _mm256_insertf128_si256(v256, + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop3 * 8)), 1); + __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask); + // We just need to write out the result. 
+ // This is the tricky bit that is hard to do + // if we want to return a SIMD register, since there + // is no single-instruction approach to recombine + // the two 128-bit lanes with an offset. + __m128i v128; + v128 = _mm256_castsi256_si128(almostthere); + _mm_storeu_si128( reinterpret_cast<__m128i *>(output), v128); + v128 = _mm256_extractf128_si256(almostthere, 1); + _mm_storeu_si128( reinterpret_cast<__m128i *>(output + 16 - count_ones(mask & 0xFFFF)), v128); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as 
necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline 
static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return 
~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdjson_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint32_t mask1 = uint32_t(mask); + uint32_t mask2 = uint32_t(mask >> 32); + this->chunks[0].compress(mask1, output); + this->chunks[1].compress(mask2, output + 32 - count_ones(mask1)); + return 64 - count_ones(mask); + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + 
this->chunks[1].store(ptr+sizeof(simd8)*1); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdjson_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdjson_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_SIMD_H diff --git a/contrib/libs/simdjson/include/simdjson/haswell/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/haswell/stringparsing_defs.h new file mode 100644 index 000000000000..f896a10e2c10 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/haswell/stringparsing_defs.h @@ -0,0 +1,48 @@ +#ifndef SIMDJSON_HASWELL_STRINGPARSING_DEFS_H +#define SIMDJSON_HASWELL_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/haswell/base.h" +#include "simdjson/haswell/simd.h" +#include "simdjson/haswell/bitmanipulation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace haswell { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 15 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); + return { + static_cast((v == '\\').to_bitmask()), // bs_bits + static_cast((v == '"').to_bitmask()), // quote_bits + }; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake.h b/contrib/libs/simdjson/include/simdjson/icelake.h new file mode 100644 index 000000000000..964296034064 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_ICELAKE_H +#define SIMDJSON_ICELAKE_H + +#include "simdjson/icelake/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/icelake/end.h" + +#endif // SIMDJSON_ICELAKE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/icelake/base.h b/contrib/libs/simdjson/include/simdjson/icelake/base.h new file mode 100644 index 000000000000..f44c0d32a1f1 --- /dev/null +++ 
b/contrib/libs/simdjson/include/simdjson/icelake/base.h @@ -0,0 +1,20 @@ +#ifndef SIMDJSON_ICELAKE_BASE_H +#define SIMDJSON_ICELAKE_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE +namespace simdjson { +/** + * Implementation for Icelake (Intel AVX512). + */ +namespace icelake { + +class implementation; + +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/begin.h b/contrib/libs/simdjson/include/simdjson/icelake/begin.h new file mode 100644 index 000000000000..60fe2cd6bf94 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/begin.h @@ -0,0 +1,13 @@ +#define SIMDJSON_IMPLEMENTATION icelake +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/intrinsics.h" + +#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +SIMDJSON_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,pclmul,lzcnt,popcnt") +#endif + +#include "simdjson/icelake/bitmanipulation.h" +#include "simdjson/icelake/bitmask.h" +#include "simdjson/icelake/simd.h" +#include "simdjson/icelake/stringparsing_defs.h" +#include "simdjson/icelake/numberparsing_defs.h" diff --git a/contrib/libs/simdjson/include/simdjson/icelake/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/icelake/bitmanipulation.h new file mode 100644 index 000000000000..5bcf7116015f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/bitmanipulation.h @@ -0,0 +1,70 @@ +#ifndef SIMDJSON_ICELAKE_BITMANIPULATION_H +#define SIMDJSON_ICELAKE_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace icelake { +namespace { + +// We sometimes call trailing_zero on inputs 
that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. +// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#if SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/bitmask.h b/contrib/libs/simdjson/include/simdjson/icelake/bitmask.h new 
file mode 100644 index 000000000000..ed55962fa490 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/bitmask.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_ICELAKE_BITMASK_H +#define SIMDJSON_ICELAKE_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace icelake { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_BITMASK_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/end.h b/contrib/libs/simdjson/include/simdjson/icelake/end.h new file mode 100644 index 000000000000..2accd5fb0d50 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/end.h @@ -0,0 +1,9 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +SIMDJSON_UNTARGET_REGION +#endif + +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/icelake/implementation.h b/contrib/libs/simdjson/include/simdjson/icelake/implementation.h new file mode 100644 index 000000000000..940c5f9923cc --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/implementation.h @@ -0,0 +1,36 @@ +#ifndef SIMDJSON_ICELAKE_IMPLEMENTATION_H +#define SIMDJSON_ICELAKE_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/implementation.h" 
+#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE +namespace simdjson { +namespace icelake { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "icelake", + "Intel/AMD AVX512", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512F | internal::instruction_set::AVX512DQ | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/intrinsics.h b/contrib/libs/simdjson/include/simdjson/icelake/intrinsics.h new file mode 100644 index 000000000000..4c68c290c1e6 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/intrinsics.h @@ -0,0 +1,60 @@ +#ifndef SIMDJSON_ICELAKE_INTRINSICS_H +#define SIMDJSON_ICELAKE_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + +#if SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You 
are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdjson, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// Important: we need the AVX-512 headers: +#include +#include +#include +#include +#include +#include +#include +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. 
+#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDJSON_CLANG_VISUAL_STUDIO + +static_assert(sizeof(__m512i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for icelake"); + +#endif // SIMDJSON_ICELAKE_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/icelake/numberparsing_defs.h new file mode 100644 index 000000000000..b095cf6c66c1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/numberparsing_defs.h @@ -0,0 +1,57 @@ +#ifndef SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H +#define SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/intrinsics.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace icelake { +namespace numberparsing { + +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so we are being wasteful. 
+ const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +/** @private */ +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace icelake +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/ondemand.h b/contrib/libs/simdjson/include/simdjson/icelake/ondemand.h new file mode 100644 index 000000000000..e2f13b4787a3 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_ICELAKE_ONDEMAND_H +#define SIMDJSON_ICELAKE_ONDEMAND_H + +#include "simdjson/icelake/begin.h" 
+#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/icelake/end.h" + +#endif // SIMDJSON_ICELAKE_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/icelake/simd.h b/contrib/libs/simdjson/include/simdjson/icelake/simd.h new file mode 100644 index 000000000000..04203f4b9a57 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/simd.h @@ -0,0 +1,372 @@ +#ifndef SIMDJSON_ICELAKE_SIMD_H +#define SIMDJSON_ICELAKE_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/intrinsics.h" +#include "simdjson/icelake/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ == 8 +#define SIMDJSON_GCC8 1 +#endif // __GNUC__ == 8 +#endif // defined(__GNUC__) && !defined(__clang__) + +#if SIMDJSON_GCC8 +/** + * GCC 8 fails to provide _mm512_set_epi8. We roll our own. + */ +inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) { + return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + 
(uint64_t(a1) << 48) + (uint64_t(a0) << 56), + uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56), + uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56), + uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56), + uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56), + uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56), + uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56), + uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56)); +} +#endif // SIMDJSON_GCC8 + + + +namespace simdjson { +namespace icelake { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. 
+ template + struct base { + __m512i value; + + // Zero constructor + simdjson_inline base() : value{__m512i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m512i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m512i&() const { return this->value; } + simdjson_inline operator __m512i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm512_or_si512(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm512_and_si512(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm512_xor_si512(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm512_andnot_si512(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
+ template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m512i _value) : base>(_value) {} + + friend simdjson_really_inline uint64_t operator==(const simd8 lhs, const simd8 rhs) { + return _mm512_cmpeq_epi8_mask(lhs, rhs); + } + + static const int SIZE = sizeof(base::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + // workaround for compilers unable to figure out that 16 - N is a constant (GCC 8) + constexpr int shift = 16 - N; + return _mm512_alignr_epi8(*this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *this), shift); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm512_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m512i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + simdjson_inline bool any() const { return !!_mm512_test_epi8_mask (*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm512_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm512_setzero_si512(); } + static simdjson_inline simd8 load(const T values[64]) { + return _mm512_loadu_si512(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, 
v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m512i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[64]) const { return _mm512_storeu_si512(reinterpret_cast<__m512i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm512_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm512_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm512_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
+ template + simdjson_inline void compress(uint64_t mask, L * output) const { + _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m512i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[64]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31, + int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39, + int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47, + int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55, + int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63 + ) : simd8(_mm512_set_epi8( + v63, v62, v61, v60, v59, v58, v57, v56, + v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, + v39, v38, 
v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, + v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 + )) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm512_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm512_min_epi8(*this, other); } + + simdjson_inline simd8 operator>(const simd8 other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*this, other),_mm512_set1_epi8(uint8_t(0x80))); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *this),_mm512_set1_epi8(uint8_t(0x80))); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m512i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[64]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, 
uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31, + uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39, + uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47, + uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55, + uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63 + ) : simd8(_mm512_set_epi8( + v63, v62, v61, v60, v59, v58, v57, v56, + v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, + v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, + v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 + )) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm512_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm512_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm512_max_epu8(*this, 
other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm512_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline uint64_t operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline uint64_t operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return _mm512_mask_blend_epi8(*this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + + simdjson_inline bool is_ascii() const { return _mm512_movepi8_mask(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { + return !_mm512_test_epi8_mask(*this, *this); + } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return !_mm512_test_epi8_mask(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm512_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return 
simd8(_mm512_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdjson_inline uint64_t get_bit() const { return _mm512_movepi8_mask(_mm512_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 1, "Icelake kernel should use one register per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdjson_inline simd8x64(const simd8 chunk0) : chunks{chunk0} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + this->chunks[0].compress(mask, output); + return 64 - count_ones(mask); + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + } + + simdjson_inline simd8 reduce_or() const { + return this->chunks[0]; + } + + simdjson_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask + ); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->chunks[0] == mask; + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return this->chunks[0] == other.chunks[0]; + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->chunks[0] <= mask; + } + }; // struct simd8x64 + +} // namespace simd + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_SIMD_H diff --git 
a/contrib/libs/simdjson/include/simdjson/icelake/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/icelake/stringparsing_defs.h new file mode 100644 index 000000000000..4cc582737f25 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/icelake/stringparsing_defs.h @@ -0,0 +1,48 @@ +#ifndef SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H +#define SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/icelake/base.h" +#include "simdjson/icelake/simd.h" +#include "simdjson/icelake/bitmanipulation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace icelake { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 64; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint64_t bs_bits; + uint64_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 15 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); + return { + static_cast(v == '\\'), // bs_bits + static_cast(v == '"'), // quote_bits + }; +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H 
diff --git a/contrib/libs/simdjson/include/simdjson/implementation.h b/contrib/libs/simdjson/include/simdjson/implementation.h new file mode 100644 index 000000000000..19cb37162977 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/implementation.h @@ -0,0 +1,230 @@ +#ifndef SIMDJSON_IMPLEMENTATION_H +#define SIMDJSON_IMPLEMENTATION_H + +#include "simdjson/internal/atomic_ptr.h" +#include "simdjson/internal/dom_parser_implementation.h" + +#include + +namespace simdjson { + +/** + * Validate the UTF-8 string. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if the string is valid UTF-8. + */ +simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) noexcept; +/** + * Validate the UTF-8 string. + * + * @param sv the string_view to validate. + * @return true if the string is valid UTF-8. + */ +simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string_view sv) noexcept { + return validate_utf8(sv.data(), sv.size()); +} + +/** + * Validate the UTF-8 string. + * + * @param p the string to validate. + * @return true if the string is valid UTF-8. + */ +simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string& s) noexcept { + return validate_utf8(s.data(), s.size()); +} + +/** + * An implementation of simdjson for a particular CPU architecture. + * + * Also used to maintain the currently active implementation. The active implementation is + * automatically initialized on first use to the most advanced implementation supported by the host. + */ +class implementation { +public: + + /** + * The name of this implementation. + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64". 
+ */ + virtual std::string name() const { return std::string(_name); } + + /** + * The description of this implementation. + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the description of the implementation, e.g. "Intel/AMD AVX2", "Intel/AMD SSE4.2", "ARM NEON". + */ + virtual std::string description() const { return std::string(_description); } + + /** + * The instruction sets this implementation is compiled against + * and the current CPU match. This function may poll the current CPU/system + * and should therefore not be called too often if performance is a concern. + * + * @return true if the implementation can be safely used on the current system (determined at runtime). + */ + bool supported_by_runtime_system() const; + + /** + * @private For internal implementation use + * + * The instruction sets this implementation is compiled against. + * + * @return a mask of all required `internal::instruction_set::` values. + */ + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; } + + /** + * @private For internal implementation use + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @param capacity The largest document that will be passed to the parser. + * @param max_depth The maximum JSON object/array nesting this parser is expected to handle. + * @param dst The place to put the resulting parser implementation. + * @return the error code, or SUCCESS if there was no error. 
+ */ + virtual error_code create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr &dst + ) const noexcept = 0; + + /** + * @private For internal implementation use + * + * Minify the input string assuming that it represents a JSON string, does not parse or validate. + * + * Overridden by each implementation. + * + * @param buf the json document to minify. + * @param len the length of the json document. + * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + simdjson_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + +protected: + /** @private Construct an implementation with the given name and description. For subclasses. */ + simdjson_inline implementation( + std::string_view name, + std::string_view description, + uint32_t required_instruction_sets + ) : + _name(name), + _description(description), + _required_instruction_sets(required_instruction_sets) + { + } +protected: + ~implementation() = default; + +private: + /** + * The name of this implementation. + */ + std::string_view _name; + + /** + * The description of this implementation. + */ + std::string_view _description; + + /** + * Instruction sets required for this implementation. + */ + const uint32_t _required_instruction_sets; +}; + +/** @private */ +namespace internal { + +/** + * The list of available implementations compiled into simdjson. 
+ */ +class available_implementation_list { +public: + /** Get the list of available implementations compiled into simdjson */ + simdjson_inline available_implementation_list() {} + /** Number of implementations */ + size_t size() const noexcept; + /** STL const begin() iterator */ + const implementation * const *begin() const noexcept; + /** STL const end() iterator */ + const implementation * const *end() const noexcept; + + /** + * Get the implementation with the given name. + * + * Case sensitive. + * + * const implementation *impl = simdjson::get_available_implementations()["westmere"]; + * if (!impl) { exit(1); } + * if (!imp->supported_by_runtime_system()) { exit(1); } + * simdjson::get_active_implementation() = impl; + * + * @param name the implementation to find, e.g. "westmere", "haswell", "arm64" + * @return the implementation, or nullptr if the parse failed. + */ + const implementation * operator[](const std::string_view &name) const noexcept { + for (const implementation * impl : *this) { + if (impl->name() == name) { return impl; } + } + return nullptr; + } + + /** + * Detect the most advanced implementation supported by the current host. + * + * This is used to initialize the implementation on startup. + * + * const implementation *impl = simdjson::available_implementation::detect_best_supported(); + * simdjson::get_active_implementation() = impl; + * + * @return the most advanced supported implementation for the current host, or an + * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported + * implementation. Will never return nullptr. + */ + const implementation *detect_best_supported() const noexcept; +}; + +} // namespace internal + +/** + * The list of available implementations compiled into simdjson. + */ +extern SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); + +/** + * The active implementation. 
+ * + * Automatically initialized on first use to the most advanced implementation supported by this hardware. + */ +extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation(); + +} // namespace simdjson + +#endif // SIMDJSON_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/implementation_detection.h b/contrib/libs/simdjson/include/simdjson/implementation_detection.h new file mode 100644 index 000000000000..0ff315b7adb1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/implementation_detection.h @@ -0,0 +1,168 @@ +#ifndef SIMDJSON_IMPLEMENTATION_DETECTION_H +#define SIMDJSON_IMPLEMENTATION_DETECTION_H + +#include "simdjson/base.h" + +// 0 is reserved, because undefined SIMDJSON_IMPLEMENTATION equals 0 in preprocessor macros. +#define SIMDJSON_IMPLEMENTATION_ID_arm64 1 +#define SIMDJSON_IMPLEMENTATION_ID_fallback 2 +#define SIMDJSON_IMPLEMENTATION_ID_haswell 3 +#define SIMDJSON_IMPLEMENTATION_ID_icelake 4 +#define SIMDJSON_IMPLEMENTATION_ID_ppc64 5 +#define SIMDJSON_IMPLEMENTATION_ID_westmere 6 +#define SIMDJSON_IMPLEMENTATION_ID_lsx 7 +#define SIMDJSON_IMPLEMENTATION_ID_lasx 8 + +#define SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL) SIMDJSON_CAT(SIMDJSON_IMPLEMENTATION_ID_, IMPL) +#define SIMDJSON_IMPLEMENTATION_ID SIMDJSON_IMPLEMENTATION_ID_FOR(SIMDJSON_IMPLEMENTATION) + +#define SIMDJSON_IMPLEMENTATION_IS(IMPL) SIMDJSON_IMPLEMENTATION_ID == SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL) + +// +// First, figure out which implementations can be run. Doing it here makes it so we don't have to worry about the order +// in which we include them. +// + +#ifndef SIMDJSON_IMPLEMENTATION_ARM64 +#define SIMDJSON_IMPLEMENTATION_ARM64 (SIMDJSON_IS_ARM64) +#endif +#if SIMDJSON_IMPLEMENTATION_ARM64 && SIMDJSON_IS_ARM64 +#define SIMDJSON_CAN_ALWAYS_RUN_ARM64 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_ARM64 0 +#endif + +// Default Icelake to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. 
+#ifndef SIMDJSON_IMPLEMENTATION_ICELAKE +#define SIMDJSON_IMPLEMENTATION_ICELAKE ((SIMDJSON_IS_X86_64) && (SIMDJSON_AVX512_ALLOWED) && (SIMDJSON_COMPILER_SUPPORTS_VBMI2)) +#endif + +#ifdef _MSC_VER +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdjson/simdjson/issues/1247 +#if ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__)) +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 0 +#endif + +#else + +#if ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__)) +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 0 +#endif + +#endif + +// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. +#ifndef SIMDJSON_IMPLEMENTATION_HASWELL +#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +// if icelake is always available, never enable haswell. +#define SIMDJSON_IMPLEMENTATION_HASWELL 0 +#else +#define SIMDJSON_IMPLEMENTATION_HASWELL SIMDJSON_IS_X86_64 +#endif +#endif +#ifdef _MSC_VER +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdjson/simdjson/issues/1247 +#if ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__)) +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 0 +#endif + +#else + +#if ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__)) +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 0 +#endif + +#endif + +// Default Westmere to on if this is x86-64. 
+#ifndef SIMDJSON_IMPLEMENTATION_WESTMERE +#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE || SIMDJSON_CAN_ALWAYS_RUN_HASWELL +// if icelake or haswell are always available, never enable westmere. +#define SIMDJSON_IMPLEMENTATION_WESTMERE 0 +#else +#define SIMDJSON_IMPLEMENTATION_WESTMERE SIMDJSON_IS_X86_64 +#endif +#endif + +#if (SIMDJSON_IMPLEMENTATION_WESTMERE && SIMDJSON_IS_X86_64 && __SSE4_2__ && __PCLMUL__) +#define SIMDJSON_CAN_ALWAYS_RUN_WESTMERE 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_WESTMERE 0 +#endif + + +#ifndef SIMDJSON_IMPLEMENTATION_PPC64 +#define SIMDJSON_IMPLEMENTATION_PPC64 (SIMDJSON_IS_PPC64 && SIMDJSON_IS_PPC64_VMX) +#endif +#if SIMDJSON_IMPLEMENTATION_PPC64 && SIMDJSON_IS_PPC64 && SIMDJSON_IS_PPC64_VMX +#define SIMDJSON_CAN_ALWAYS_RUN_PPC64 1 +#else +#define SIMDJSON_CAN_ALWAYS_RUN_PPC64 0 +#endif + +#ifndef SIMDJSON_IMPLEMENTATION_LASX +#define SIMDJSON_IMPLEMENTATION_LASX (SIMDJSON_IS_LOONGARCH64 && __loongarch_asx) +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_LASX (SIMDJSON_IMPLEMENTATION_LASX) + +#ifndef SIMDJSON_IMPLEMENTATION_LSX +#if SIMDJSON_CAN_ALWAYS_RUN_LASX +#define SIMDJSON_IMPLEMENTATION_LSX 0 +#else +#define SIMDJSON_IMPLEMENTATION_LSX (SIMDJSON_IS_LOONGARCH64 && __loongarch_sx) +#endif +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_LSX (SIMDJSON_IMPLEMENTATION_LSX) + +// Default Fallback to on unless a builtin implementation has already been selected. +#ifndef SIMDJSON_IMPLEMENTATION_FALLBACK +#if SIMDJSON_CAN_ALWAYS_RUN_ARM64 || SIMDJSON_CAN_ALWAYS_RUN_ICELAKE || SIMDJSON_CAN_ALWAYS_RUN_HASWELL || SIMDJSON_CAN_ALWAYS_RUN_WESTMERE || SIMDJSON_CAN_ALWAYS_RUN_PPC64 || SIMDJSON_CAN_ALWAYS_RUN_LSX || SIMDJSON_CAN_ALWAYS_RUN_LASX +// if anything at all except fallback can always run, then disable fallback. 
+#define SIMDJSON_IMPLEMENTATION_FALLBACK 0 +#else +#define SIMDJSON_IMPLEMENTATION_FALLBACK 1 +#endif +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_FALLBACK SIMDJSON_IMPLEMENTATION_FALLBACK + +// Determine the best builtin implementation +#ifndef SIMDJSON_BUILTIN_IMPLEMENTATION + +#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +#define SIMDJSON_BUILTIN_IMPLEMENTATION icelake +#elif SIMDJSON_CAN_ALWAYS_RUN_HASWELL +#define SIMDJSON_BUILTIN_IMPLEMENTATION haswell +#elif SIMDJSON_CAN_ALWAYS_RUN_WESTMERE +#define SIMDJSON_BUILTIN_IMPLEMENTATION westmere +#elif SIMDJSON_CAN_ALWAYS_RUN_ARM64 +#define SIMDJSON_BUILTIN_IMPLEMENTATION arm64 +#elif SIMDJSON_CAN_ALWAYS_RUN_PPC64 +#define SIMDJSON_BUILTIN_IMPLEMENTATION ppc64 +#elif SIMDJSON_CAN_ALWAYS_RUN_LSX +#define SIMDJSON_BUILTIN_IMPLEMENTATION lsx +#elif SIMDJSON_CAN_ALWAYS_RUN_LASX +#define SIMDJSON_BUILTIN_IMPLEMENTATION lasx +#elif SIMDJSON_CAN_ALWAYS_RUN_FALLBACK +#define SIMDJSON_BUILTIN_IMPLEMENTATION fallback +#else +#error "All possible implementations (including fallback) have been disabled! simdjson will not run." 
+#endif + +#endif // SIMDJSON_BUILTIN_IMPLEMENTATION + +#define SIMDJSON_BUILTIN_IMPLEMENTATION_ID SIMDJSON_IMPLEMENTATION_ID_FOR(SIMDJSON_BUILTIN_IMPLEMENTATION) +#define SIMDJSON_BUILTIN_IMPLEMENTATION_IS(IMPL) SIMDJSON_BUILTIN_IMPLEMENTATION_ID == SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL) + +#endif // SIMDJSON_IMPLEMENTATION_DETECTION_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/internal/atomic_ptr.h b/contrib/libs/simdjson/include/simdjson/internal/atomic_ptr.h new file mode 100644 index 000000000000..c4fe41b05a66 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/atomic_ptr.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_INTERNAL_ATOMIC_PTR_H +#define SIMDJSON_INTERNAL_ATOMIC_PTR_H + +#include "simdjson/base.h" +#include + +namespace simdjson { +namespace internal { + +template +class atomic_ptr { +public: + atomic_ptr(T *_ptr) : ptr{_ptr} {} + + operator const T*() const { return ptr.load(); } + const T& operator*() const { return *ptr; } + const T* operator->() const { return ptr.load(); } + + operator T*() { return ptr.load(); } + T& operator*() { return *ptr; } + T* operator->() { return ptr.load(); } + atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } + +private: + std::atomic ptr; +}; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_ATOMIC_PTR_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/dom_parser_implementation.h b/contrib/libs/simdjson/include/simdjson/internal/dom_parser_implementation.h new file mode 100644 index 000000000000..a93fe38ff9e0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/dom_parser_implementation.h @@ -0,0 +1,252 @@ +#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H + +#include "simdjson/base.h" +#include "simdjson/error.h" +#include + +namespace simdjson { + +namespace dom { +class document; +} // namespace dom + +/** +* This enum is used with the 
dom_parser_implementation::stage1 function. +* 1) The regular mode expects a fully formed JSON document. +* 2) The streaming_partial mode expects a possibly truncated +* input within a stream on JSON documents. +* 3) The stream_final mode allows us to truncate final +* unterminated strings. It is useful in conjunction with streaming_partial. +*/ +enum class stage1_mode { regular, streaming_partial, streaming_final}; + +/** + * Returns true if mode == streaming_partial or mode == streaming_final + */ +inline bool is_streaming(stage1_mode mode) { + // performance note: it is probably faster to check that mode is different + // from regular than checking that it is either streaming_partial or streaming_final. + return (mode != stage1_mode::regular); + // return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final); +} + + +namespace internal { + + +/** + * An implementation of simdjson's DOM parser for a particular CPU architecture. + * + * This class is expected to be accessed only by pointer, and never move in memory (though the + * pointer can move). + */ +class dom_parser_implementation { +public: + + /** + * @private For internal implementation use + * + * Run a full JSON parse on a single document (stage1 + stage2). + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len The length of the json document. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 1 of the document parser. + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. 
+ * @param len The length of the json document. + * @param streaming Whether this is being called by parser::parse_many. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser. + * + * Called after stage1(). + * + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code stage2(dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser for parser::parse_many. + * + * Guaranteed only to be called after stage1(). + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed. + */ + simdjson_warn_unused virtual error_code stage2_next(dom::document &doc) noexcept = 0; + + /** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_ptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + * + * Overridden by each implementation. + * + * @param str pointer to the beginning of a valid UTF-8 JSON string, must end with an unescaped quote. + * @param dst pointer to a destination buffer, it must point a region in memory of sufficient size. + * @param allow_replacement whether we allow a replacement character when the UTF-8 contains unmatched surrogate pairs. 
+ * @return end of the of the written region (exclusive) or nullptr in case of error. + */ + simdjson_warn_unused virtual uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept = 0; + + /** + * Unescape a NON-valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_ptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + * + * Overridden by each implementation. + * + * @param str pointer to the beginning of a possibly invalid UTF-8 JSON string, must end with an unescaped quote. + * @param dst pointer to a destination buffer, it must point a region in memory of sufficient size. + * @return end of the of the written region (exclusive) or nullptr in case of error. + */ + simdjson_warn_unused virtual uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept = 0; + + /** + * Change the capacity of this parser. + * + * The capacity can never exceed SIMDJSON_MAXSIZE_BYTES (e.g., 4 GB) + * and an CAPACITY error is returned if it is attempted. + * + * Generally used for reallocation. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_capacity(size_t capacity) noexcept = 0; + + /** + * Change the max depth of this parser. + * + * Generally used for reallocation. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_max_depth(size_t max_depth) noexcept = 0; + + /** + * Deallocate this parser. 
+ */ + virtual ~dom_parser_implementation() = default; + + /** Number of structural indices passed from stage 1 to stage 2 */ + uint32_t n_structural_indexes{0}; + /** Structural indices passed from stage 1 to stage 2 */ + std::unique_ptr structural_indexes{}; + /** Next structural index to parse */ + uint32_t next_structural_index{0}; + + /** + * The largest document this parser can support without reallocating. + * + * @return Current capacity, in bytes. + */ + simdjson_pure simdjson_inline size_t capacity() const noexcept; + + /** + * The maximum level of nested object and arrays supported by this parser. + * + * @return Maximum depth, in bytes. + */ + simdjson_pure simdjson_inline size_t max_depth() const noexcept; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth) noexcept; + + +protected: + /** + * The maximum document length this parser supports. + * + * Buffers are large enough to handle any document up to this length. + */ + size_t _capacity{0}; + + /** + * The maximum depth (number of nested objects and arrays) supported by this parser. + * + * Defaults to DEFAULT_MAX_DEPTH. + */ + size_t _max_depth{0}; + + // Declaring these so that subclasses can use them to implement their constructors. 
+ simdjson_inline dom_parser_implementation() noexcept; + simdjson_inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + simdjson_inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + + simdjson_inline dom_parser_implementation(const dom_parser_implementation &) noexcept = delete; + simdjson_inline dom_parser_implementation &operator=(const dom_parser_implementation &other) noexcept = delete; +}; // class dom_parser_implementation + +simdjson_inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +simdjson_inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +simdjson_inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +simdjson_pure simdjson_inline size_t dom_parser_implementation::capacity() const noexcept { + return _capacity; +} + +simdjson_pure simdjson_inline size_t dom_parser_implementation::max_depth() const noexcept { + return _max_depth; +} + +simdjson_warn_unused +inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept { + if (this->max_depth() != max_depth) { + error_code err = set_max_depth(max_depth); + if (err) { return err; } + } + if (_capacity != capacity) { + error_code err = set_capacity(capacity); + if (err) { return err; } + } + return SUCCESS; +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/instruction_set.h b/contrib/libs/simdjson/include/simdjson/internal/instruction_set.h new file mode 100644 index 000000000000..1dc0a81fb3c5 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/instruction_set.h @@ -0,0 +1,77 @@ +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. 
+ +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SIMDJSON_INTERNAL_INSTRUCTION_SET_H +#define SIMDJSON_INTERNAL_INSTRUCTION_SET_H + +namespace simdjson { +namespace internal { + +enum instruction_set { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, + PCLMULQDQ = 0x10, + BMI1 = 0x20, + BMI2 = 0x40, + ALTIVEC = 0x80, + AVX512F = 0x100, + AVX512DQ = 0x200, + AVX512IFMA = 0x400, + AVX512PF = 0x800, + AVX512ER = 0x1000, + AVX512CD = 0x2000, + AVX512BW = 0x4000, + AVX512VL = 0x8000, + AVX512VBMI2 = 0x10000, + LSX = 0x20000, + LASX = 0x40000, +}; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_INSTRUCTION_SET_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/jsoncharutils_tables.h b/contrib/libs/simdjson/include/simdjson/internal/jsoncharutils_tables.h new file mode 100644 index 000000000000..d72bce122622 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/jsoncharutils_tables.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H +#define SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H + +#include "simdjson/base.h" + +#ifdef JSON_TEST_STRINGS +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); +#endif + +namespace simdjson { +namespace internal { +// structural chars here are +// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) +// we are also interested in the four whitespace characters +// space 0x20, 
linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + +extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256]; +extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256]; +extern SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886]; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/jsonformatutils.h b/contrib/libs/simdjson/include/simdjson/internal/jsonformatutils.h new file mode 100644 index 000000000000..43000ab43183 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/jsonformatutils.h @@ -0,0 +1,64 @@ +#ifndef SIMDJSON_INTERNAL_JSONFORMATUTILS_H +#define SIMDJSON_INTERNAL_JSONFORMATUTILS_H + +#include "simdjson/base.h" +#include +#include +#include + +namespace simdjson { +namespace internal { + +inline std::ostream& operator<<(std::ostream& out, const escape_json_string &str); + +class escape_json_string { +public: + escape_json_string(std::string_view _str) noexcept : str{_str} {} + operator std::string() const noexcept { std::stringstream s; s << *this; return s.str(); } +private: + std::string_view str; + friend std::ostream& operator<<(std::ostream& out, const escape_json_string &unescaped); +}; + +inline std::ostream& operator<<(std::ostream& out, const escape_json_string &unescaped) { + for (size_t i=0; i(unescaped.str[i]) <= 0x1F) { + // TODO can this be done once at the beginning, or will it mess up << char? 
+ std::ios::fmtflags f(out.flags()); + out << "\\u" << std::hex << std::setw(4) << std::setfill('0') << int(unescaped.str[i]); + out.flags(f); + } else { + out << unescaped.str[i]; + } + } + } + return out; +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_JSONFORMATUTILS_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/numberparsing_tables.h b/contrib/libs/simdjson/include/simdjson/internal/numberparsing_tables.h new file mode 100644 index 000000000000..1762056f7506 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/numberparsing_tables.h @@ -0,0 +1,59 @@ +#ifndef SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H +#define SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H + +#include "simdjson/base.h" + +namespace simdjson { +namespace internal { +/** + * The smallest non-zero float (binary64) is 2^-1074. + * We take as input numbers of the form w x 10^q where w < 2^64. + * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. + * However, we have that + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074. + * Thus it is possible for a number of the form w * 10^-342 where + * w is a 64-bit value to be a non-zero floating-point number. + ********* + * Any number of form w * 10^309 where w>= 1 is going to be + * infinite in binary64 so we never need to worry about powers + * of 5 greater than 308. + */ +constexpr int smallest_power = -342; +constexpr int largest_power = 308; + +/** + * Represents a 128-bit value. + * low: least significant 64 bits. + * high: most significant 64 bits. + */ +struct value128 { + uint64_t low; + uint64_t high; +}; + + +// Precomputed powers of ten from 10^0 to 10^22. These +// can be represented exactly using the double type. 
+extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[]; + + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + + +// The truncated powers of five from 5^-342 all the way to 5^308 +// The mantissa is truncated to 128 bits, and +// never rounded up. Uses about 10KB. +extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[]; +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/simdprune_tables.h b/contrib/libs/simdjson/include/simdjson/internal/simdprune_tables.h new file mode 100644 index 000000000000..7b8f4650c15c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/simdprune_tables.h @@ -0,0 +1,21 @@ +#ifndef SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H +#define SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H + +#include "simdjson/base.h" + +#include + +namespace simdjson { // table modified and copied from +namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable + +extern SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256]; + +extern SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272]; + +// 256 * 8 bytes = 2kB, easily fits in cache. 
+extern SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256]; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/tape_ref-inl.h b/contrib/libs/simdjson/include/simdjson/internal/tape_ref-inl.h new file mode 100644 index 000000000000..a100d569e2b1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/tape_ref-inl.h @@ -0,0 +1,118 @@ +#ifndef SIMDJSON_TAPE_REF_INL_H +#define SIMDJSON_TAPE_REF_INL_H + +#include "simdjson/dom/document.h" +#include "simdjson/internal/tape_ref.h" +#include "simdjson/internal/tape_type.h" + +#include + +namespace simdjson { +namespace internal { + +constexpr const uint64_t JSON_VALUE_MASK = 0x00FFFFFFFFFFFFFF; +constexpr const uint32_t JSON_COUNT_MASK = 0xFFFFFF; + +// +// tape_ref inline implementation +// +simdjson_inline tape_ref::tape_ref() noexcept : doc{nullptr}, json_index{0} {} +simdjson_inline tape_ref::tape_ref(const dom::document *_doc, size_t _json_index) noexcept : doc{_doc}, json_index{_json_index} {} + + +simdjson_inline bool tape_ref::is_document_root() const noexcept { + return json_index == 1; // should we ever change the structure of the tape, this should get updated. +} +simdjson_inline bool tape_ref::usable() const noexcept { + return doc != nullptr; // when the document pointer is null, this tape_ref is uninitialized (should not be accessed). +} +// Some value types have a specific on-tape word value. It can be faster +// to check the type by doing a word-to-word comparison instead of extracting the +// most significant 8 bits. 
+ +simdjson_inline bool tape_ref::is_double() const noexcept { + constexpr uint64_t tape_double = uint64_t(tape_type::DOUBLE)<<56; + return doc->tape[json_index] == tape_double; +} +simdjson_inline bool tape_ref::is_int64() const noexcept { + constexpr uint64_t tape_int64 = uint64_t(tape_type::INT64)<<56; + return doc->tape[json_index] == tape_int64; +} +simdjson_inline bool tape_ref::is_uint64() const noexcept { + constexpr uint64_t tape_uint64 = uint64_t(tape_type::UINT64)<<56; + return doc->tape[json_index] == tape_uint64; +} +simdjson_inline bool tape_ref::is_false() const noexcept { + constexpr uint64_t tape_false = uint64_t(tape_type::FALSE_VALUE)<<56; + return doc->tape[json_index] == tape_false; +} +simdjson_inline bool tape_ref::is_true() const noexcept { + constexpr uint64_t tape_true = uint64_t(tape_type::TRUE_VALUE)<<56; + return doc->tape[json_index] == tape_true; +} +simdjson_inline bool tape_ref::is_null_on_tape() const noexcept { + constexpr uint64_t tape_null = uint64_t(tape_type::NULL_VALUE)<<56; + return doc->tape[json_index] == tape_null; +} + +inline size_t tape_ref::after_element() const noexcept { + switch (tape_ref_type()) { + case tape_type::START_ARRAY: + case tape_type::START_OBJECT: + return matching_brace_index(); + case tape_type::UINT64: + case tape_type::INT64: + case tape_type::DOUBLE: + return json_index + 2; + default: + return json_index + 1; + } +} +simdjson_inline tape_type tape_ref::tape_ref_type() const noexcept { + return static_cast(doc->tape[json_index] >> 56); +} +simdjson_inline uint64_t internal::tape_ref::tape_value() const noexcept { + return doc->tape[json_index] & internal::JSON_VALUE_MASK; +} +simdjson_inline uint32_t internal::tape_ref::matching_brace_index() const noexcept { + return uint32_t(doc->tape[json_index]); +} +simdjson_inline uint32_t internal::tape_ref::scope_count() const noexcept { + return uint32_t((doc->tape[json_index] >> 32) & internal::JSON_COUNT_MASK); +} + +template +simdjson_inline T 
tape_ref::next_tape_value() const noexcept { + static_assert(sizeof(T) == sizeof(uint64_t), "next_tape_value() template parameter must be 64-bit"); + // Though the following is tempting... + // return *reinterpret_cast(&doc->tape[json_index + 1]); + // It is not generally safe. It is safer, and often faster to rely + // on memcpy. Yes, it is uglier, but it is also encapsulated. + T x; + std::memcpy(&x,&doc->tape[json_index + 1],sizeof(uint64_t)); + return x; +} + +simdjson_inline uint32_t internal::tape_ref::get_string_length() const noexcept { + size_t string_buf_index = size_t(tape_value()); + uint32_t len; + std::memcpy(&len, &doc->string_buf[string_buf_index], sizeof(len)); + return len; +} + +simdjson_inline const char * internal::tape_ref::get_c_str() const noexcept { + size_t string_buf_index = size_t(tape_value()); + return reinterpret_cast(&doc->string_buf[string_buf_index + sizeof(uint32_t)]); +} + +inline std::string_view internal::tape_ref::get_string_view() const noexcept { + return std::string_view( + get_c_str(), + get_string_length() + ); +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_TAPE_REF_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/tape_ref.h b/contrib/libs/simdjson/include/simdjson/internal/tape_ref.h new file mode 100644 index 000000000000..922a05701022 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/tape_ref.h @@ -0,0 +1,49 @@ +#ifndef SIMDJSON_INTERNAL_TAPE_REF_H +#define SIMDJSON_INTERNAL_TAPE_REF_H + +#include "simdjson/base.h" + +namespace simdjson { +namespace dom { +class document; +} // namespace dom + +namespace internal { + +/** + * A reference to an element on the tape. Internal only. 
+ */ +class tape_ref { +public: + simdjson_inline tape_ref() noexcept; + simdjson_inline tape_ref(const dom::document *doc, size_t json_index) noexcept; + inline size_t after_element() const noexcept; + simdjson_inline tape_type tape_ref_type() const noexcept; + simdjson_inline uint64_t tape_value() const noexcept; + simdjson_inline bool is_double() const noexcept; + simdjson_inline bool is_int64() const noexcept; + simdjson_inline bool is_uint64() const noexcept; + simdjson_inline bool is_false() const noexcept; + simdjson_inline bool is_true() const noexcept; + simdjson_inline bool is_null_on_tape() const noexcept;// different name to avoid clash with is_null. + simdjson_inline uint32_t matching_brace_index() const noexcept; + simdjson_inline uint32_t scope_count() const noexcept; + template + simdjson_inline T next_tape_value() const noexcept; + simdjson_inline uint32_t get_string_length() const noexcept; + simdjson_inline const char * get_c_str() const noexcept; + inline std::string_view get_string_view() const noexcept; + simdjson_inline bool is_document_root() const noexcept; + simdjson_inline bool usable() const noexcept; + + /** The document this element references. */ + const dom::document *doc; + + /** The index of this element on `doc.tape[]` */ + size_t json_index; +}; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_TAPE_REF_H diff --git a/contrib/libs/simdjson/include/simdjson/internal/tape_type.h b/contrib/libs/simdjson/include/simdjson/internal/tape_type.h new file mode 100644 index 000000000000..d43c57c7a49e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/internal/tape_type.h @@ -0,0 +1,28 @@ +#ifndef SIMDJSON_INTERNAL_TAPE_TYPE_H +#define SIMDJSON_INTERNAL_TAPE_TYPE_H + +namespace simdjson { +namespace internal { + +/** + * The possible types in the tape. 
+ */ +enum class tape_type { + ROOT = 'r', + START_ARRAY = '[', + START_OBJECT = '{', + END_ARRAY = ']', + END_OBJECT = '}', + STRING = '"', + INT64 = 'l', + UINT64 = 'u', + DOUBLE = 'd', + TRUE_VALUE = 't', + FALSE_VALUE = 'f', + NULL_VALUE = 'n' +}; // enum class tape_type + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_TAPE_TYPE_H diff --git a/contrib/libs/simdjson/include/simdjson/jsonioutil.h b/contrib/libs/simdjson/include/simdjson/jsonioutil.h new file mode 100644 index 000000000000..cff25ab69324 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/jsonioutil.h @@ -0,0 +1,22 @@ +#ifndef SIMDJSON_JSONIOUTIL_H +#define SIMDJSON_JSONIOUTIL_H + +#include "simdjson/base.h" +#include "simdjson/padded_string.h" + +#include "simdjson/padded_string-inl.h" + +namespace simdjson { + +#if SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("Use padded_string::load() instead")]] +inline padded_string get_corpus(const char *path) { + return padded_string::load(path); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS + +} // namespace simdjson + +#endif // SIMDJSON_JSONIOUTIL_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx.h b/contrib/libs/simdjson/include/simdjson/lasx.h new file mode 100644 index 000000000000..37a20c89863c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_LASX_H +#define SIMDJSON_LASX_H + +#include "simdjson/lasx/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/lasx/end.h" + +#endif // SIMDJSON_LASX_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/base.h b/contrib/libs/simdjson/include/simdjson/lasx/base.h new file mode 100644 index 000000000000..9d9a866c3756 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/base.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_LASX_BASE_H +#define SIMDJSON_LASX_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" 
+#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Implementation for LASX. + */ +namespace lasx { + +class implementation; + +namespace { +namespace simd { +template struct simd8; +template struct simd8x64; +} // namespace simd +} // unnamed namespace + +} // namespace lasx +} // namespace simdjson + +#endif // SIMDJSON_LASX_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/begin.h b/contrib/libs/simdjson/include/simdjson/lasx/begin.h new file mode 100644 index 000000000000..560eba737ce8 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/begin.h @@ -0,0 +1,10 @@ +#define SIMDJSON_IMPLEMENTATION lasx +#include "simdjson/lasx/base.h" +#include "simdjson/lasx/intrinsics.h" +#include "simdjson/lasx/bitmanipulation.h" +#include "simdjson/lasx/bitmask.h" +#include "simdjson/lasx/numberparsing_defs.h" +#include "simdjson/lasx/simd.h" +#include "simdjson/lasx/stringparsing_defs.h" + +#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1 diff --git a/contrib/libs/simdjson/include/simdjson/lasx/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/lasx/bitmanipulation.h new file mode 100644 index 000000000000..962ddbf56bc4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/bitmanipulation.h @@ -0,0 +1,50 @@ +#ifndef SIMDJSON_LASX_BITMANIPULATION_H +#define SIMDJSON_LASX_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#include "simdjson/lasx/intrinsics.h" +#include "simdjson/lasx/bitmask.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lasx { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return __builtin_clzll(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int count_ones(uint64_t input_num) { + return __lasx_xvpickve2gr_w(__lasx_xvpcnt_d(__m256i(v4u64{input_num, 0, 0, 0})), 0); +} + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +} + +} // unnamed namespace +} // namespace lasx +} // namespace simdjson + +#endif // SIMDJSON_LASX_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/bitmask.h b/contrib/libs/simdjson/include/simdjson/lasx/bitmask.h new file mode 100644 index 000000000000..e847c1d71000 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/bitmask.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_LASX_BITMASK_H +#define SIMDJSON_LASX_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lasx { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. 
+// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace lasx +} // namespace simdjson + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/lasx/end.h b/contrib/libs/simdjson/include/simdjson/lasx/end.h new file mode 100644 index 000000000000..2f5ec807907f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/end.h @@ -0,0 +1,6 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/lasx/implementation.h b/contrib/libs/simdjson/include/simdjson/lasx/implementation.h new file mode 100644 index 000000000000..8aafbb8b88b1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/implementation.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_LASX_IMPLEMENTATION_H +#define SIMDJSON_LASX_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lasx { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("lasx", "LoongArch ASX", internal::instruction_set::LASX) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, 
size_t len) const noexcept final; +}; + +} // namespace lasx +} // namespace simdjson + +#endif // SIMDJSON_LASX_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/intrinsics.h b/contrib/libs/simdjson/include/simdjson/lasx/intrinsics.h new file mode 100644 index 000000000000..47b152d9a4d1 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/intrinsics.h @@ -0,0 +1,14 @@ +#ifndef SIMDJSON_LASX_INTRINSICS_H +#define SIMDJSON_LASX_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This should be the correct header whether +// you use visual studio or other compilers. +#error #include + +static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch ASX"); + +#endif // SIMDJSON_LASX_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/lasx/numberparsing_defs.h new file mode 100644 index 000000000000..cb9278261c6f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/numberparsing_defs.h @@ -0,0 +1,41 @@ +#ifndef SIMDJSON_LASX_NUMBERPARSING_DEFS_H +#define SIMDJSON_LASX_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#include "simdjson/lasx/intrinsics.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace lasx { +namespace numberparsing { + +// we don't have appropriate instructions, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 
42949672960001 >> 32); +} + +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); + return answer; +} + +} // namespace numberparsing +} // namespace lasx +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_LASX_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/ondemand.h b/contrib/libs/simdjson/include/simdjson/lasx/ondemand.h new file mode 100644 index 000000000000..9f7ab96fa6dd --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_LASX_ONDEMAND_H +#define SIMDJSON_LASX_ONDEMAND_H + +#include "simdjson/lasx/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/lasx/end.h" + +#endif // SIMDJSON_LASX_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/simd.h b/contrib/libs/simdjson/include/simdjson/lasx/simd.h new file mode 100644 index 000000000000..907a8acb9b36 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/simd.h @@ -0,0 +1,376 @@ +#ifndef SIMDJSON_LASX_SIMD_H +#define SIMDJSON_LASX_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#include "simdjson/lasx/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lasx { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. 
+ template + struct base { + __m256i value; + + // Zero constructor + simdjson_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m256i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m256i&() const { return this->value; } + simdjson_inline operator __m256i&() { return this->value; } + simdjson_inline operator const v32i8&() const { return (v32i8&)this->value; } + simdjson_inline operator v32i8&() { return (v32i8&)this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return __lasx_xvor_v(*this, other); } + simdjson_inline Child operator&(const Child other) const { return __lasx_xvand_v(*this, other); } + simdjson_inline Child operator^(const Child other) const { return __lasx_xvxor_v(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return __lasx_xvandn_v(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
+ template + struct simd8; + + template> + struct base8: base> { + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m256i _value) : base>(_value) {} + + friend simdjson_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return __lasx_xvseq_b(lhs, rhs); } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + __m256i hi = __lasx_xvbsll_v(*this, N); + __m256i lo = __lasx_xvbsrl_v(*this, 16 - N); + __m256i tmp = __lasx_xvbsrl_v(prev_chunk, 16 - N); + lo = __lasx_xvpermi_q(lo, tmp, 0x21); + return __lasx_xvor_v(hi, lo); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { + __m256i mask = __lasx_xvmskltz_b(*this); + return (__lasx_xvpickve2gr_w(mask, 4) << 16) | (__lasx_xvpickve2gr_w(mask, 0)); + } + simdjson_inline bool any() const { + __m256i v = __lasx_xvmsknz_b(*this); + return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4)); + } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { + return __lasx_xvreplgr2vr_b(_value); + } + static simdjson_inline simd8 zero() { return __lasx_xvldi(0); } + static simdjson_inline simd8 load(const T values[32]) { + return __lasx_xvld(reinterpret_cast(values), 0); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return 
simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[32]) const { + return __lasx_xvst(*this, reinterpret_cast<__m256i *>(dst), 0); + } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return __lasx_xvadd_b(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return __lasx_xvsub_b(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return __lasx_xvshuf_b(lookup_table, lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + template + simdjson_inline void compress(uint32_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by haswell + // lasx do it in 4 steps, first 8 bytes and then second 8 bytes... 
+ uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // second significant 8 bits + uint8_t mask3 = uint8_t(mask >> 16); // ... + uint8_t mask4 = uint8_t(mask >> 24); // ... + // next line just loads the 64-bit values thintable_epi8[mask{1,2,3,4}] + // into a 256-bit register. + __m256i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808, int64_t(thintable_epi8[mask3]), int64_t(thintable_epi8[mask4]) + 0x0808080808080808}; + // this is the version "nearly pruned" + __m256i pruned = __lasx_xvshuf_b(*this, *this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + int pop2 = BitsSetTable256mul2[mask2]; + int pop3 = BitsSetTable256mul2[mask3]; + + // then load the corresponding mask + __m256i masklo = __lasx_xvldx(reinterpret_cast(reinterpret_cast(pshufb_combine_table)), pop1 * 8); + __m256i maskhi = __lasx_xvldx(reinterpret_cast(reinterpret_cast(pshufb_combine_table)), pop3 * 8); + __m256i compactmask = __lasx_xvpermi_q(maskhi, masklo, 0x20); + __m256i answer = __lasx_xvshuf_b(pruned, pruned, compactmask); + __lasx_xvst(answer, reinterpret_cast(output), 0); + uint64_t value3 = __lasx_xvpickve2gr_du(answer, 2); + uint64_t value4 = __lasx_xvpickve2gr_du(answer, 3); + uint64_t *pos = reinterpret_cast(reinterpret_cast(output) + 16 - (pop1 + pop2) / 2); + pos[0] = value3; + pos[1] = value4; + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // 
Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8({ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + }) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return __lasx_xvmax_b(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return __lasx_xvmin_b(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return __lasx_xvslt_b(other, *this); } + simdjson_inline simd8 operator<(const simd8 other) const { return __lasx_xvslt_b(*this, other); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + 
simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(__m256i(v32u8{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + })) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return __lasx_xvsadd_bu(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return __lasx_xvssub_bu(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return __lasx_xvmax_bu(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return __lasx_xvmin_bu(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { 
return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { + __m256i mask = __lasx_xvmskltz_b(*this); + return (0 == __lasx_xvpickve2gr_w(mask, 0)) && (0 == __lasx_xvpickve2gr_w(mask, 4)); + } + simdjson_inline bool bits_not_set_anywhere() const { + __m256i v = __lasx_xvmsknz_b(*this); + return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4)); + } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { + __m256i v = __lasx_xvmsknz_b(__lasx_xvand_v(*this, bits)); + return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4)); + } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(__lasx_xvsrli_b(*this, N)); } + template + simdjson_inline simd8 shl() const { return simd8(__lasx_xvslli_b(*this, N)); } + }; + + template + struct simd8x64 { + static 
constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, "LASX kernel should use two registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint32_t mask1 = uint32_t(mask); + uint32_t mask2 = uint32_t(mask >> 32); + __m256i zcnt = __lasx_xvpcnt_w(__m256i(v4u64{~mask, 0, 0, 0})); + uint64_t zcnt1 = __lasx_xvpickve2gr_wu(zcnt, 0); + uint64_t zcnt2 = __lasx_xvpickve2gr_wu(zcnt, 1); + // There should be a critical value which processes in scaler is faster. + if (zcnt1) + this->chunks[0].compress(mask1, output); + if (zcnt2) + this->chunks[1].compress(mask2, output + zcnt1); + return zcnt1 + zcnt2; + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + } + + simdjson_inline uint64_t to_bitmask() const { + __m256i mask0 = __lasx_xvmskltz_b(this->chunks[0]); + __m256i mask1 = __lasx_xvmskltz_b(this->chunks[1]); + __m256i mask_tmp = __lasx_xvpickve_w(mask0, 4); + __m256i tmp = __lasx_xvpickve_w(mask1, 4); + mask0 = __lasx_xvinsve0_w(mask0, mask1, 1); + mask_tmp = __lasx_xvinsve0_w(mask_tmp, tmp, 1); + return __lasx_xvpickve2gr_du(__lasx_xvpackev_h(mask_tmp, mask0), 0); + } + + simdjson_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 
&other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace lasx +} // namespace simdjson + +#endif // SIMDJSON_LASX_SIMD_H diff --git a/contrib/libs/simdjson/include/simdjson/lasx/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/lasx/stringparsing_defs.h new file mode 100644 index 000000000000..fe7a7430e007 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lasx/stringparsing_defs.h @@ -0,0 +1,47 @@ +#ifndef SIMDJSON_LASX_STRINGPARSING_DEFS_H +#define SIMDJSON_LASX_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lasx/base.h" +#include "simdjson/lasx/simd.h" +#include "simdjson/lasx/bitmanipulation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lasx { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v(src); + v.store(dst); + return { + static_cast((v == '\\').to_bitmask()), // bs_bits + static_cast((v == '"').to_bitmask()), // quote_bits + }; +} + +} // unnamed namespace +} // namespace lasx +} // namespace simdjson + +#endif // SIMDJSON_LASX_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx.h b/contrib/libs/simdjson/include/simdjson/lsx.h new file mode 100644 index 000000000000..1496e9ceb6fe --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_LSX_H +#define SIMDJSON_LSX_H + +#include "simdjson/lsx/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/lsx/end.h" + +#endif // SIMDJSON_LSX_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/base.h b/contrib/libs/simdjson/include/simdjson/lsx/base.h new file mode 100644 index 000000000000..ff02450184a8 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/base.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_LSX_BASE_H +#define SIMDJSON_LSX_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include 
"simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Implementation for LSX. + */ +namespace lsx { + +class implementation; + +namespace { +namespace simd { +template struct simd8; +template struct simd8x64; +} // namespace simd +} // unnamed namespace + +} // namespace lsx +} // namespace simdjson + +#endif // SIMDJSON_LSX_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/begin.h b/contrib/libs/simdjson/include/simdjson/lsx/begin.h new file mode 100644 index 000000000000..78a92819a346 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/begin.h @@ -0,0 +1,10 @@ +#define SIMDJSON_IMPLEMENTATION lsx +#include "simdjson/lsx/base.h" +#include "simdjson/lsx/intrinsics.h" +#include "simdjson/lsx/bitmanipulation.h" +#include "simdjson/lsx/bitmask.h" +#include "simdjson/lsx/numberparsing_defs.h" +#include "simdjson/lsx/simd.h" +#include "simdjson/lsx/stringparsing_defs.h" + +#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1 diff --git a/contrib/libs/simdjson/include/simdjson/lsx/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/lsx/bitmanipulation.h new file mode 100644 index 000000000000..96e1794bae54 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/bitmanipulation.h @@ -0,0 +1,50 @@ +#ifndef SIMDJSON_LSX_BITMANIPULATION_H +#define SIMDJSON_LSX_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#include "simdjson/lsx/intrinsics.h" +#include "simdjson/lsx/bitmask.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lsx { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return __builtin_clzll(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int count_ones(uint64_t input_num) { + return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__m128i(v2u64{input_num, 0})), 0); +} + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +} + +} // unnamed namespace +} // namespace lsx +} // namespace simdjson + +#endif // SIMDJSON_LSX_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/bitmask.h b/contrib/libs/simdjson/include/simdjson/lsx/bitmask.h new file mode 100644 index 000000000000..3a9f0d768c5e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/bitmask.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_LSX_BITMASK_H +#define SIMDJSON_LSX_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lsx { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. 
+// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace lsx +} // namespace simdjson + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/lsx/end.h b/contrib/libs/simdjson/include/simdjson/lsx/end.h new file mode 100644 index 000000000000..0ae4d372286d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/end.h @@ -0,0 +1,6 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/lsx/implementation.h b/contrib/libs/simdjson/include/simdjson/lsx/implementation.h new file mode 100644 index 000000000000..14468777de71 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/implementation.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_LSX_IMPLEMENTATION_H +#define SIMDJSON_LSX_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lsx { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("lsx", "LoongArch SX", internal::instruction_set::LSX) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const 
noexcept final; +}; + +} // namespace lsx +} // namespace simdjson + +#endif // SIMDJSON_LSX_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/intrinsics.h b/contrib/libs/simdjson/include/simdjson/lsx/intrinsics.h new file mode 100644 index 000000000000..c4cf1dd60942 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/intrinsics.h @@ -0,0 +1,14 @@ +#ifndef SIMDJSON_LSX_INTRINSICS_H +#define SIMDJSON_LSX_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This should be the correct header whether +// you use visual studio or other compilers. +#error #include + +static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch SX"); + +#endif // SIMDJSON_LSX_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/lsx/numberparsing_defs.h new file mode 100644 index 000000000000..1dfe4264e925 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/numberparsing_defs.h @@ -0,0 +1,41 @@ +#ifndef SIMDJSON_LSX_NUMBERPARSING_DEFS_H +#define SIMDJSON_LSX_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#include "simdjson/lsx/intrinsics.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace lsx { +namespace numberparsing { + +// we don't have appropriate instructions, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +simdjson_inline 
internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); + return answer; +} + +} // namespace numberparsing +} // namespace lsx +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_LSX_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/ondemand.h b/contrib/libs/simdjson/include/simdjson/lsx/ondemand.h new file mode 100644 index 000000000000..b1b612e17145 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_LSX_ONDEMAND_H +#define SIMDJSON_LSX_ONDEMAND_H + +#include "simdjson/lsx/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/lsx/end.h" + +#endif // SIMDJSON_LSX_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/simd.h b/contrib/libs/simdjson/include/simdjson/lsx/simd.h new file mode 100644 index 000000000000..3f0d66560d35 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/simd.h @@ -0,0 +1,354 @@ +#ifndef SIMDJSON_LSX_SIMD_H +#define SIMDJSON_LSX_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#include "simdjson/lsx/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lsx { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. 
+ template + struct base { + __m128i value; + + // Zero constructor + simdjson_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m128i&() const { return this->value; } + simdjson_inline operator __m128i&() { return this->value; } + simdjson_inline operator const v16i8&() const { return (v16i8&)this->value; } + simdjson_inline operator v16i8&() { return (v16i8&)this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return __lsx_vor_v(*this, other); } + simdjson_inline Child operator&(const Child other) const { return __lsx_vand_v(*this, other); } + simdjson_inline Child operator^(const Child other) const { return __lsx_vxor_v(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return __lsx_vandn_v(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
+ template + struct simd8; + + template> + struct base8: base> { + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m128i _value) : base>(_value) {} + + friend simdjson_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return __lsx_vseq_b(lhs, rhs); } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(*this, N), __lsx_vbsrl_v(prev_chunk, 16 - N)); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { + return __lsx_vreplgr2vr_b(uint8_t(-(!!_value))); + } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { return __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); } + simdjson_inline bool any() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return __lsx_vreplgr2vr_b(_value); } + static simdjson_inline simd8 zero() { return __lsx_vldi(0); } + static simdjson_inline simd8 load(const T values[16]) { + return __lsx_vld(reinterpret_cast(values), 0); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[16]) const { + 
return __lsx_vst(*this, reinterpret_cast<__m128i *>(dst), 0); + } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return __lsx_vadd_b(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return __lsx_vsub_b(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return __lsx_vshuf_b(lookup_table, lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + template + simdjson_inline void compress(uint16_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by haswell + // lsx do it in 2 steps, first 8 bytes and then second 8 bytes... + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register. 
+ __m128i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808}; + // this is the version "nearly pruned" + __m128i pruned = __lsx_vshuf_b(*this, *this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask + __m128i compactmask = __lsx_vldx(reinterpret_cast(reinterpret_cast(pshufb_combine_table)), pop1 * 8); + __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask); + __lsx_vst(answer, reinterpret_cast(output), 0); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[16]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8({ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t 
v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return __lsx_vmax_b(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return __lsx_vmin_b(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return __lsx_vslt_b(other, *this); } + simdjson_inline simd8 operator<(const simd8 other) const { return __lsx_vslt_b(*this, other); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(__m128i(v16u8{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + })) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return __lsx_vsadd_bu(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return 
__lsx_vssub_bu(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return __lsx_vmax_bu(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return __lsx_vmin_bu(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { return 0 == __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); } + simdjson_inline bool bits_not_set_anywhere() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { + return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(__lsx_vand_v(*this, bits)), 0); + } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 
shr() const { return simd8(__lsx_vsrli_b(*this, N)); } + template + simdjson_inline simd8 shl() const { return simd8(__lsx_vslli_b(*this, N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "LSX kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint16_t mask1 = uint16_t(mask); + uint16_t mask2 = uint16_t(mask >> 16); + uint16_t mask3 = uint16_t(mask >> 32); + uint16_t mask4 = uint16_t(mask >> 48); + __m128i zcnt = __lsx_vpcnt_h(__m128i(v2u64{~mask, 0})); + uint64_t zcnt1 = __lsx_vpickve2gr_hu(zcnt, 0); + uint64_t zcnt2 = __lsx_vpickve2gr_hu(zcnt, 1); + uint64_t zcnt3 = __lsx_vpickve2gr_hu(zcnt, 2); + uint64_t zcnt4 = __lsx_vpickve2gr_hu(zcnt, 3); + uint8_t *voutput = reinterpret_cast(output); + // There should be a critical value which processes in scaler is faster. 
+ if (zcnt1) + this->chunks[0].compress(mask1, reinterpret_cast(voutput)); + voutput += zcnt1; + if (zcnt2) + this->chunks[1].compress(mask2, reinterpret_cast(voutput)); + voutput += zcnt2; + if (zcnt3) + this->chunks[2].compress(mask3, reinterpret_cast(voutput)); + voutput += zcnt3; + if (zcnt4) + this->chunks[3].compress(mask4, reinterpret_cast(voutput)); + voutput += zcnt4; + return reinterpret_cast(voutput) - reinterpret_cast(output); + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + this->chunks[2].store(ptr+sizeof(simd8)*2); + this->chunks[3].store(ptr+sizeof(simd8)*3); + } + + simdjson_inline uint64_t to_bitmask() const { + __m128i mask1 = __lsx_vmskltz_b(this->chunks[0]); + __m128i mask2 = __lsx_vmskltz_b(this->chunks[1]); + __m128i mask3 = __lsx_vmskltz_b(this->chunks[2]); + __m128i mask4 = __lsx_vmskltz_b(this->chunks[3]); + mask1 = __lsx_vilvl_h(mask2, mask1); + mask2 = __lsx_vilvl_h(mask4, mask3); + return __lsx_vpickve2gr_du(__lsx_vilvl_w(mask2, mask1), 0); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd 
+} // unnamed namespace +} // namespace lsx +} // namespace simdjson + +#endif // SIMDJSON_LSX_SIMD_H diff --git a/contrib/libs/simdjson/include/simdjson/lsx/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/lsx/stringparsing_defs.h new file mode 100644 index 000000000000..af493dc55f64 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/lsx/stringparsing_defs.h @@ -0,0 +1,53 @@ +#ifndef SIMDJSON_LSX_STRINGPARSING_DEFS_H +#define SIMDJSON_LSX_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/lsx/base.h" +#include "simdjson/lsx/simd.h" +#include "simdjson/lsx/bitmanipulation.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace lsx { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); + + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on LSX; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. 
+ uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace lsx +} // namespace simdjson + +#endif // SIMDJSON_LSX_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/minify.h b/contrib/libs/simdjson/include/simdjson/minify.h new file mode 100644 index 000000000000..8b8f217e4f83 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/minify.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_MINIFY_H +#define SIMDJSON_MINIFY_H + +#include "simdjson/base.h" +#include "simdjson/padded_string.h" +#include +#include +#include + +namespace simdjson { + +/** + * + * Minify the input string assuming that it represents a JSON string, does not parse or validate. + * This function is much faster than parsing a JSON string and then writing a minified version of it. + * However, it does not validate the input. It will merely return an error in simple cases (e.g., if + * there is a string that was never terminated). + * + * + * @param buf the json document to minify. + * @param len the length of the json document. + * @param dst the buffer to write the minified document to. *MUST* be allocated up to len bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. 
+ */ +simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept; + +} // namespace simdjson + +#endif // SIMDJSON_MINIFY_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/ondemand.h b/contrib/libs/simdjson/include/simdjson/ondemand.h new file mode 100644 index 000000000000..d17615910d02 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ondemand.h @@ -0,0 +1,13 @@ +#ifndef SIMDJSON_ONDEMAND_H +#define SIMDJSON_ONDEMAND_H + +#include "simdjson/builtin/ondemand.h" + +namespace simdjson { + /** + * @copydoc simdjson::builtin::ondemand + */ + namespace ondemand = builtin::ondemand; +} // namespace simdjson + +#endif // SIMDJSON_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/padded_string-inl.h b/contrib/libs/simdjson/include/simdjson/padded_string-inl.h new file mode 100644 index 000000000000..abb566fa3275 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/padded_string-inl.h @@ -0,0 +1,190 @@ +#ifndef SIMDJSON_PADDED_STRING_INL_H +#define SIMDJSON_PADDED_STRING_INL_H + +#include "simdjson/padded_string.h" +#include "simdjson/padded_string_view.h" + +#include "simdjson/error-inl.h" +#include "simdjson/padded_string_view-inl.h" + +#include + +namespace simdjson { +namespace internal { + +// The allocate_padded_buffer function is a low-level function to allocate memory +// with padding so we can read past the "length" bytes safely. It is used by +// the padded_string class automatically. It returns nullptr in case +// of error: the caller should check for a null pointer. +// The length parameter is the maximum size in bytes of the string. +// The caller is responsible to free the memory (e.g., delete[] (...)). 
+inline char *allocate_padded_buffer(size_t length) noexcept { + const size_t totalpaddedlength = length + SIMDJSON_PADDING; + if(totalpaddedlength(1UL<<20)) { + return nullptr; + } +#endif + + char *padded_buffer = new (std::nothrow) char[totalpaddedlength]; + if (padded_buffer == nullptr) { + return nullptr; + } + // We write nulls in the padded region to avoid having uninitialized + // content which may trigger warning for some sanitizers + std::memset(padded_buffer + length, 0, totalpaddedlength - length); + return padded_buffer; +} // allocate_padded_buffer() + +} // namespace internal + + +inline padded_string::padded_string() noexcept = default; +inline padded_string::padded_string(size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { +} +inline padded_string::padded_string(const char *data, size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { + if ((data != nullptr) && (data_ptr != nullptr)) { + std::memcpy(data_ptr, data, length); + } +} +#ifdef __cpp_char8_t +inline padded_string::padded_string(const char8_t *data, size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { + if ((data != nullptr) && (data_ptr != nullptr)) { + std::memcpy(data_ptr, reinterpret_cast(data), length); + } +} +#endif +// note: do not pass std::string arguments by value +inline padded_string::padded_string(const std::string & str_ ) noexcept + : viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) { + if (data_ptr != nullptr) { + std::memcpy(data_ptr, str_.data(), str_.size()); + } +} +// note: do pass std::string_view arguments by value +inline padded_string::padded_string(std::string_view sv_) noexcept + : viable_size(sv_.size()), data_ptr(internal::allocate_padded_buffer(sv_.size())) { + if(simdjson_unlikely(!data_ptr)) { + //allocation failed or zero size + viable_size = 0; + return; + } + if (sv_.size()) { + 
std::memcpy(data_ptr, sv_.data(), sv_.size()); + } +} +inline padded_string::padded_string(padded_string &&o) noexcept + : viable_size(o.viable_size), data_ptr(o.data_ptr) { + o.data_ptr = nullptr; // we take ownership + o.viable_size = 0; +} + +inline padded_string &padded_string::operator=(padded_string &&o) noexcept { + delete[] data_ptr; + data_ptr = o.data_ptr; + viable_size = o.viable_size; + o.data_ptr = nullptr; // we take ownership + o.viable_size = 0; + return *this; +} + +inline void padded_string::swap(padded_string &o) noexcept { + size_t tmp_viable_size = viable_size; + char *tmp_data_ptr = data_ptr; + viable_size = o.viable_size; + data_ptr = o.data_ptr; + o.data_ptr = tmp_data_ptr; + o.viable_size = tmp_viable_size; +} + +inline padded_string::~padded_string() noexcept { + delete[] data_ptr; +} + +inline size_t padded_string::size() const noexcept { return viable_size; } + +inline size_t padded_string::length() const noexcept { return viable_size; } + +inline const char *padded_string::data() const noexcept { return data_ptr; } + +inline char *padded_string::data() noexcept { return data_ptr; } + +inline padded_string::operator std::string_view() const { return std::string_view(data(), length()); } + +inline padded_string::operator padded_string_view() const noexcept { + return padded_string_view(data(), length(), length() + SIMDJSON_PADDING); +} + +inline simdjson_result padded_string::load(std::string_view filename) noexcept { + // Open the file + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + std::FILE *fp = std::fopen(filename.data(), "rb"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (fp == nullptr) { + return IO_ERROR; + } + + // Get the file size + int ret; +#if SIMDJSON_VISUAL_STUDIO && !SIMDJSON_IS_32BITS + ret = _fseeki64(fp, 0, SEEK_END); +#else + ret = std::fseek(fp, 0, SEEK_END); +#endif // _WIN64 + if(ret < 0) { + std::fclose(fp); + return 
IO_ERROR; + } +#if SIMDJSON_VISUAL_STUDIO && !SIMDJSON_IS_32BITS + __int64 llen = _ftelli64(fp); + if(llen == -1L) { + std::fclose(fp); + return IO_ERROR; + } +#else + long llen = std::ftell(fp); + if((llen < 0) || (llen == LONG_MAX)) { + std::fclose(fp); + return IO_ERROR; + } +#endif + + // Allocate the padded_string + size_t len = static_cast(llen); + padded_string s(len); + if (s.data() == nullptr) { + std::fclose(fp); + return MEMALLOC; + } + + // Read the padded_string + std::rewind(fp); + size_t bytes_read = std::fread(s.data(), 1, len, fp); + if (std::fclose(fp) != 0 || bytes_read != len) { + return IO_ERROR; + } + + return s; +} + +} // namespace simdjson + +inline simdjson::padded_string operator "" _padded(const char *str, size_t len) { + return simdjson::padded_string(str, len); +} +#ifdef __cpp_char8_t +inline simdjson::padded_string operator "" _padded(const char8_t *str, size_t len) { + return simdjson::padded_string(reinterpret_cast(str), len); +} +#endif +#endif // SIMDJSON_PADDED_STRING_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/padded_string.h b/contrib/libs/simdjson/include/simdjson/padded_string.h new file mode 100644 index 000000000000..536669b4657b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/padded_string.h @@ -0,0 +1,183 @@ +#ifndef SIMDJSON_PADDED_STRING_H +#define SIMDJSON_PADDED_STRING_H + +#include "simdjson/base.h" +#include "simdjson/error.h" + +#include "simdjson/error-inl.h" + +#include +#include +#include +#include + +namespace simdjson { + +class padded_string_view; + +/** + * String with extra allocation for ease of use with parser::parse() + * + * This is a move-only class, it cannot be copied. + */ +struct padded_string final { + + /** + * Create a new, empty padded string. + */ + explicit inline padded_string() noexcept; + /** + * Create a new padded string buffer. + * + * @param length the size of the string. 
+ */ + explicit inline padded_string(size_t length) noexcept; + /** + * Create a new padded string by copying the given input. + * + * @param data the buffer to copy + * @param length the number of bytes to copy + */ + explicit inline padded_string(const char *data, size_t length) noexcept; +#ifdef __cpp_char8_t + explicit inline padded_string(const char8_t *data, size_t length) noexcept; +#endif + /** + * Create a new padded string by copying the given input. + * + * @param str_ the string to copy + */ + inline padded_string(const std::string & str_ ) noexcept; + /** + * Create a new padded string by copying the given input. + * + * @param sv_ the string to copy + */ + inline padded_string(std::string_view sv_) noexcept; + /** + * Move one padded string into another. + * + * The original padded string will be reduced to zero capacity. + * + * @param o the string to move. + */ + inline padded_string(padded_string &&o) noexcept; + /** + * Move one padded string into another. + * + * The original padded string will be reduced to zero capacity. + * + * @param o the string to move. + */ + inline padded_string &operator=(padded_string &&o) noexcept; + inline void swap(padded_string &o) noexcept; + ~padded_string() noexcept; + + /** + * The length of the string. + * + * Does not include padding. + */ + size_t size() const noexcept; + + /** + * The length of the string. + * + * Does not include padding. + */ + size_t length() const noexcept; + + /** + * The string data. + **/ + const char *data() const noexcept; + const uint8_t *u8data() const noexcept { return static_cast(static_cast(data_ptr));} + + /** + * The string data. + **/ + char *data() noexcept; + + /** + * Create a std::string_view with the same content. + */ + operator std::string_view() const; + + /** + * Create a padded_string_view with the same content. + */ + operator padded_string_view() const noexcept; + + /** + * Load this padded string from a file. 
+ * + * ## Windows and Unicode + * + * Windows users who need to read files with non-ANSI characters in the + * name should set their code page to UTF-8 (65001) before calling this + * function. This should be the default with Windows 11 and better. + * Further, they may use the AreFileApisANSI function to determine whether + * the filename is interpreted using the ANSI or the system default OEM + * codepage, and they may call SetFileApisToOEM accordingly. + * + * @return IO_ERROR on error. Be mindful that on some 32-bit systems, + * the file size might be limited to 2 GB. + * + * @param path the path to the file. + **/ + inline static simdjson_result load(std::string_view path) noexcept; + +private: + padded_string &operator=(const padded_string &o) = delete; + padded_string(const padded_string &o) = delete; + + size_t viable_size{0}; + char *data_ptr{nullptr}; + +}; // padded_string + +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. + * @param s The padded_string instance. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, const padded_string& s) { return out << s.data(); } + +#if SIMDJSON_EXCEPTIONS +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. + * @param s The padded_string instance. + * @throw simdjson_error if the result being printed has an error. If there is an error with the + * underlying output stream, that error will be propagated (simdjson_error will not be + * thrown). 
+ */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &s) noexcept(false) { return out << s.value(); } +#endif + +} // namespace simdjson + +// This is deliberately outside of simdjson so that people get it without having to use the namespace +inline simdjson::padded_string operator "" _padded(const char *str, size_t len); +#ifdef __cpp_char8_t +inline simdjson::padded_string operator "" _padded(const char8_t *str, size_t len); +#endif + +namespace simdjson { +namespace internal { + +// The allocate_padded_buffer function is a low-level function to allocate memory +// with padding so we can read past the "length" bytes safely. It is used by +// the padded_string class automatically. It returns nullptr in case +// of error: the caller should check for a null pointer. +// The length parameter is the maximum size in bytes of the string. +// The caller is responsible to free the memory (e.g., delete[] (...)). +inline char *allocate_padded_buffer(size_t length) noexcept; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_PADDED_STRING_H diff --git a/contrib/libs/simdjson/include/simdjson/padded_string_view-inl.h b/contrib/libs/simdjson/include/simdjson/padded_string_view-inl.h new file mode 100644 index 000000000000..1c1811de4c42 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/padded_string_view-inl.h @@ -0,0 +1,59 @@ +#ifndef SIMDJSON_PADDED_STRING_VIEW_INL_H +#define SIMDJSON_PADDED_STRING_VIEW_INL_H + +#include "simdjson/padded_string_view.h" +#include "simdjson/error-inl.h" + +#include /* memcmp */ + +namespace simdjson { + +inline padded_string_view::padded_string_view(const char* s, size_t len, size_t capacity) noexcept + : std::string_view(s, len), _capacity(capacity) +{ + if(_capacity < len) { _capacity = len; } +} + +inline padded_string_view::padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept + : padded_string_view(reinterpret_cast(s), len, capacity) +{ +} +#ifdef __cpp_char8_t 
+inline padded_string_view::padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept + : padded_string_view(reinterpret_cast(s), len, capacity) +{ +} +#endif +inline padded_string_view::padded_string_view(const std::string &s) noexcept + : std::string_view(s), _capacity(s.capacity()) +{ +} + +inline padded_string_view::padded_string_view(std::string_view s, size_t capacity) noexcept + : std::string_view(s), _capacity(capacity) +{ + if(_capacity < s.length()) { _capacity = s.length(); } +} + +inline size_t padded_string_view::capacity() const noexcept { return _capacity; } + +inline size_t padded_string_view::padding() const noexcept { return capacity() - length(); } + +inline bool padded_string_view::remove_utf8_bom() noexcept { + if(length() < 3) { return false; } + if (std::memcmp(data(), "\xEF\xBB\xBF", 3) == 0) { + remove_prefix(3); + _capacity -= 3; + return true; + } + return false; +} + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson_result &s) noexcept(false) { return out << s.value(); } +#endif + +} // namespace simdjson + + +#endif // SIMDJSON_PADDED_STRING_VIEW_INL_H diff --git a/contrib/libs/simdjson/include/simdjson/padded_string_view.h b/contrib/libs/simdjson/include/simdjson/padded_string_view.h new file mode 100644 index 000000000000..f8dc59a79cd0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/padded_string_view.h @@ -0,0 +1,88 @@ +#ifndef SIMDJSON_PADDED_STRING_VIEW_H +#define SIMDJSON_PADDED_STRING_VIEW_H + +#include "simdjson/portability.h" +#include "simdjson/base.h" // for SIMDJSON_PADDING +#include "simdjson/error.h" + +#include +#include +#include +#include + +namespace simdjson { + +/** + * User-provided string that promises it has extra padded bytes at the end for use with parser::parse(). + */ +class padded_string_view : public std::string_view { +private: + size_t _capacity; + +public: + /** Create an empty padded_string_view. 
*/ + inline padded_string_view() noexcept = default; + + /** + * Promise the given buffer has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * @param s The string. + * @param len The length of the string (not including padding). + * @param capacity The allocated length of the string, including padding. If the capacity is less + * than the length, the capacity will be set to the length. + */ + explicit inline padded_string_view(const char* s, size_t len, size_t capacity) noexcept; + /** overload explicit inline padded_string_view(const char* s, size_t len) noexcept */ + explicit inline padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept; +#ifdef __cpp_char8_t + explicit inline padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept; +#endif + /** + * Promise the given string has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * The capacity of the string will be used to determine its padding. + * + * @param s The string. + */ + explicit inline padded_string_view(const std::string &s) noexcept; + + /** + * Promise the given string_view has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * @param s The string. + * @param capacity The allocated length of the string, including padding. If the capacity is less + * than the length, the capacity will be set to the length. + */ + explicit inline padded_string_view(std::string_view s, size_t capacity) noexcept; + + /** The number of allocated bytes. */ + inline size_t capacity() const noexcept; + + /** + * Remove the UTF-8 Byte Order Mark (BOM) if it exists. + * + * @return whether a BOM was found and removed + */ + inline bool remove_utf8_bom() noexcept; + + /** The amount of padding on the string (capacity() - length()) */ + inline size_t padding() const noexcept; + +}; // padded_string_view + +#if SIMDJSON_EXCEPTIONS +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. 
+ * @param s The padded_string_view. + * @throw simdjson_error if the result being printed has an error. If there is an error with the + * underlying output stream, that error will be propagated (simdjson_error will not be + * thrown). + */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &s) noexcept(false); +#endif + +} // namespace simdjson + +#endif // SIMDJSON_PADDED_STRING_VIEW_H diff --git a/contrib/libs/simdjson/include/simdjson/portability.h b/contrib/libs/simdjson/include/simdjson/portability.h new file mode 100644 index 000000000000..ff65aa9deb39 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/portability.h @@ -0,0 +1,204 @@ +#ifndef SIMDJSON_PORTABILITY_H +#define SIMDJSON_PORTABILITY_H + +#include +#include +#include +#include +#include +#ifndef _WIN32 +// strcasecmp, strncasecmp +#include +#endif + +// We are using size_t without namespace std:: throughout the project +using std::size_t; + +#ifdef _MSC_VER +#define SIMDJSON_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + * + * Under clang for Windows, we enable: + * * target pragmas so that part and only part of the + * code gets compiled for advanced instructions. 
+ * + */ +#ifdef __clang__ +// clang under visual studio +#define SIMDJSON_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define SIMDJSON_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER + +#if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC) +#define SIMDJSON_IS_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) +#define SIMDJSON_IS_ARM64 1 +#elif defined(__riscv) && __riscv_xlen == 64 +#define SIMDJSON_IS_RISCV64 1 +#elif defined(__loongarch_lp64) +#define SIMDJSON_IS_LOONGARCH64 1 +#elif defined(__PPC64__) || defined(_M_PPC64) +#if defined(__ALTIVEC__) +#define SIMDJSON_IS_PPC64_VMX 1 +#endif // defined(__ALTIVEC__) +#else +#define SIMDJSON_IS_32BITS 1 + +#if defined(_M_IX86) || defined(__i386__) +#define SIMDJSON_IS_X86_32BITS 1 +#elif defined(__arm__) || defined(_M_ARM) +#define SIMDJSON_IS_ARM_32BITS 1 +#elif defined(__PPC__) || defined(_M_PPC) +#define SIMDJSON_IS_PPC_32BITS 1 +#endif + +#endif // defined(__x86_64__) || defined(_M_AMD64) +#ifndef SIMDJSON_IS_32BITS +#define SIMDJSON_IS_32BITS 0 +#endif + +#if SIMDJSON_IS_32BITS +#ifndef SIMDJSON_NO_PORTABILITY_WARNING +// In the future, we should allow programmers +// to get warning. +#endif // SIMDJSON_NO_PORTABILITY_WARNING +#endif // SIMDJSON_IS_32BITS + +#define SIMDJSON_CAT_IMPLEMENTATION_(a,...) a ## __VA_ARGS__ +#define SIMDJSON_CAT(a,...) SIMDJSON_CAT_IMPLEMENTATION_(a, __VA_ARGS__) + +#define SIMDJSON_STRINGIFY_IMPLEMENTATION_(a,...) #a SIMDJSON_STRINGIFY(__VA_ARGS__) +#define SIMDJSON_STRINGIFY(a,...) SIMDJSON_CAT_IMPLEMENTATION_(a, __VA_ARGS__) + +// this is almost standard? +#undef SIMDJSON_STRINGIFY_IMPLEMENTATION_ +#undef SIMDJSON_STRINGIFY +#define SIMDJSON_STRINGIFY_IMPLEMENTATION_(a) #a +#define SIMDJSON_STRINGIFY(a) SIMDJSON_STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. 
+// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The simdjson users should still have the fallback kernel. It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select SIMDJSON_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#if SIMDJSON_IS_X86_64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so SIMDJSON_TARGET_REGION and SIMDJSON_UNTARGET_REGION must be *outside* of a +// namespace. +#define SIMDJSON_TARGET_REGION(T) \ + _Pragma(SIMDJSON_STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define SIMDJSON_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define SIMDJSON_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(SIMDJSON_STRINGIFY(GCC target(T))) +#define SIMDJSON_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. +#ifndef SIMDJSON_TARGET_REGION +#define SIMDJSON_TARGET_REGION(T) +#define SIMDJSON_UNTARGET_REGION +#endif + +// Is threading enabled? +#if defined(_REENTRANT) || defined(_MT) +#ifndef SIMDJSON_THREADS_ENABLED +#define SIMDJSON_THREADS_ENABLED +#endif +#endif + +// workaround for large stack sizes under -O0. +// https://github.com/simdjson/simdjson/issues/691 +#ifdef __APPLE__ +#ifndef __OPTIMIZE__ +// Apple systems have small stack sizes in secondary threads. +// Lack of compiler optimization may generate high stack usage. +// Users may want to disable threads for safety, but only when +// in debug mode which we detect by the fact that the __OPTIMIZE__ +// macro is not defined. 
+#undef SIMDJSON_THREADS_ENABLED +#endif +#endif + + +#if defined(__clang__) +#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) +#elif defined(__GNUC__) +#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) +#else +#define SIMDJSON_NO_SANITIZE_UNDEFINED +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define simdjson_pure [[gnu::pure]] +#else +#define simdjson_pure +#endif + +#if defined(__clang__) || defined(__GNUC__) +#if defined(__has_feature) +# if __has_feature(memory_sanitizer) +#define SIMDJSON_NO_SANITIZE_MEMORY __attribute__((no_sanitize("memory"))) +# endif // if __has_feature(memory_sanitizer) +#endif // defined(__has_feature) +#endif +// make sure it is defined as 'nothing' if it is unapplicable. +#ifndef SIMDJSON_NO_SANITIZE_MEMORY +#define SIMDJSON_NO_SANITIZE_MEMORY +#endif + +#if SIMDJSON_VISUAL_STUDIO +// This is one case where we do not distinguish between +// regular visual studio and clang under visual studio. +// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) +#define simdjson_strcasecmp _stricmp +#define simdjson_strncasecmp _strnicmp +#else +// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). +// So they are only useful for ASCII in our context. +// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings +#define simdjson_strcasecmp strcasecmp +#define simdjson_strncasecmp strncasecmp +#endif + +#if defined(NDEBUG) || defined(__OPTIMIZE__) || (defined(_MSC_VER) && !defined(_DEBUG)) +// If NDEBUG is set, or __OPTIMIZE__ is set, or we are under MSVC in release mode, +// then do away with asserts and use __assume. 
+#if SIMDJSON_VISUAL_STUDIO +#define SIMDJSON_UNREACHABLE() __assume(0) +#define SIMDJSON_ASSUME(COND) __assume(COND) +#else +#define SIMDJSON_UNREACHABLE() __builtin_unreachable(); +#define SIMDJSON_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) +#endif + +#else // defined(NDEBUG) || defined(__OPTIMIZE__) || (defined(_MSC_VER) && !defined(_DEBUG)) +// This should only ever be enabled in debug mode. +#define SIMDJSON_UNREACHABLE() assert(0); +#define SIMDJSON_ASSUME(COND) assert(COND) + +#endif + +#endif // SIMDJSON_PORTABILITY_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64.h b/contrib/libs/simdjson/include/simdjson/ppc64.h new file mode 100644 index 000000000000..8f563cf8d86e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_PPC64_H +#define SIMDJSON_PPC64_H + +#error #include "simdjson/ppc64/begin.h" +#include "simdjson/generic/amalgamated.h" +#error #include "simdjson/ppc64/end.h" + +#endif // SIMDJSON_PPC64_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/base.h b/contrib/libs/simdjson/include/simdjson/ppc64/base.h new file mode 100644 index 000000000000..6e460dfb8e88 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/base.h @@ -0,0 +1,26 @@ +#ifndef SIMDJSON_PPC64_BASE_H +#define SIMDJSON_PPC64_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +/** + * Implementation for ALTIVEC (PPC64). 
+ */ +namespace ppc64 { + +class implementation; + +namespace { +namespace simd { +template struct simd8; +template struct simd8x64; +} // namespace simd +} // unnamed namespace + +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/begin.h b/contrib/libs/simdjson/include/simdjson/ppc64/begin.h new file mode 100644 index 000000000000..36a8d4299698 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/begin.h @@ -0,0 +1,10 @@ +#define SIMDJSON_IMPLEMENTATION ppc64 +#error #include "simdjson/ppc64/base.h" +#error #include "simdjson/ppc64/intrinsics.h" +#error #include "simdjson/ppc64/bitmanipulation.h" +#error #include "simdjson/ppc64/bitmask.h" +#error #include "simdjson/ppc64/numberparsing_defs.h" +#error #include "simdjson/ppc64/simd.h" +#error #include "simdjson/ppc64/stringparsing_defs.h" + +#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1 \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/ppc64/bitmanipulation.h new file mode 100644 index 000000000000..b2dda09ceadf --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/bitmanipulation.h @@ -0,0 +1,78 @@ +#ifndef SIMDJSON_PPC64_BITMANIPULATION_H +#define SIMDJSON_PPC64_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace ppc64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +#if SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows in this kernel + return __popcnt64(input_num); // Visual Studio wants two underscores +} +#else +simdjson_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_BITMANIPULATION_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/bitmask.h 
b/contrib/libs/simdjson/include/simdjson/ppc64/bitmask.h new file mode 100644 index 000000000000..25714ae133d4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/bitmask.h @@ -0,0 +1,46 @@ +#ifndef SIMDJSON_PPC64_BITMASK_H +#define SIMDJSON_PPC64_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace ppc64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is +// encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + // You can use the version below, however gcc sometimes miscompiles + // vec_pmsum_be, it happens somewhere around between 8 and 9th version. + // The performance boost was not noticeable, falling back to a usual + // implementation. + // __vector unsigned long long all_ones = {~0ull, ~0ull}; + // __vector unsigned long long mask = {bitmask, 0}; + // // Clang and GCC return different values for pmsum for ull so cast it to one. + // // Generally it is not specified by ALTIVEC ISA what is returned by + // // vec_pmsum_be. 
+ // #if defined(__LITTLE_ENDIAN__) + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]); + // #else + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]); + // #endif + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/end.h b/contrib/libs/simdjson/include/simdjson/ppc64/end.h new file mode 100644 index 000000000000..701538b651d4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/end.h @@ -0,0 +1,6 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/implementation.h b/contrib/libs/simdjson/include/simdjson/ppc64/implementation.h new file mode 100644 index 000000000000..505581f0cd2e --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/implementation.h @@ -0,0 +1,40 @@ +#ifndef SIMDJSON_PPC64_IMPLEMENTATION_H +#define SIMDJSON_PPC64_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { + +/** + * Implementation for ALTIVEC (PPC64). 
+ */ +namespace ppc64 { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() + : simdjson::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, size_t max_length, + std::unique_ptr &dst) + const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, + uint8_t *dst, + size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +}; + +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/intrinsics.h b/contrib/libs/simdjson/include/simdjson/ppc64/intrinsics.h new file mode 100644 index 000000000000..96d90eaae3c0 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/intrinsics.h @@ -0,0 +1,23 @@ +#ifndef SIMDJSON_PPC64_INTRINSICS_H +#define SIMDJSON_PPC64_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. 
+#ifdef bool +#undef bool +#endif + +#ifdef vector +#undef vector +#endif + +static_assert(sizeof(__vector unsigned char) <= simdjson::SIMDJSON_PADDING, "insufficient padding for ppc64"); + +#endif // SIMDJSON_PPC64_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/ppc64/numberparsing_defs.h new file mode 100644 index 000000000000..5ec2f60a6e57 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/numberparsing_defs.h @@ -0,0 +1,65 @@ +#ifndef SIMDJSON_PPC64_NUMBERPARSING_DEFS_H +#define SIMDJSON_PPC64_NUMBERPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#error #include "simdjson/ppc64/intrinsics.h" +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif + +namespace simdjson { +namespace ppc64 { +namespace numberparsing { + +// we don't have appropriate instructions, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); +#ifdef __BIG_ENDIAN__ +#if defined(__linux__) + val = bswap_64(val); +#elif defined(__FreeBSD__) + val = bswap64(val); +#endif +#endif + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +/** @private */ +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * 
value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace ppc64 +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_PPC64_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/ondemand.h b/contrib/libs/simdjson/include/simdjson/ppc64/ondemand.h new file mode 100644 index 000000000000..a9df4b5971c3 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_PPC64_ONDEMAND_H +#define SIMDJSON_PPC64_ONDEMAND_H + +#error #include "simdjson/ppc64/begin.h" +#include "simdjson/generic/ondemand/amalgamated.h" +#error #include "simdjson/ppc64/end.h" + +#endif // SIMDJSON_PPC64_ONDEMAND_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/simd.h b/contrib/libs/simdjson/include/simdjson/ppc64/simd.h new file mode 100644 index 000000000000..d20fe9c6f3ce --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/simd.h @@ -0,0 +1,472 @@ +#ifndef SIMDJSON_PPC64_SIMD_H +#define SIMDJSON_PPC64_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#error #include "simdjson/ppc64/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace ppc64 { +namespace { +namespace simd { + +using __m128i = __vector unsigned char; + +template struct base { + __m128i value; + + // Zero constructor + simdjson_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const 
__m128i &() const { + return this->value; + } + simdjson_inline operator __m128i &() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { + return vec_or(this->value, (__m128i)other); + } + simdjson_inline Child operator&(const Child other) const { + return vec_and(this->value, (__m128i)other); + } + simdjson_inline Child operator^(const Child other) const { + return vec_xor(this->value, (__m128i)other); + } + simdjson_inline Child bit_andnot(const Child other) const { + return vec_andc(this->value, (__m128i)other); + } + simdjson_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdjson_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdjson_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } +}; + +template > +struct base8 : base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m128i _value) : base>(_value) {} + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { + return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs); + } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(simd8 prev_chunk) const { + __m128i chunk = this->value; +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve(this->value); + prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); +#endif + chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve((__m128i)chunk); +#endif + return chunk; + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdjson_inline simd8 splat(bool _value) { + 
return (__m128i)vec_splats((unsigned char)(-(!!_value))); + } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m128i _value) + : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) + : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { + __vector unsigned long long result; + const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; + + result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, + (__m128i)perm_mask)); +#ifdef __LITTLE_ENDIAN__ + return static_cast(result[1]); +#else + return static_cast(result[0]); +#endif + } + simdjson_inline bool any() const { + return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdjson_inline simd8 operator~() const { + return this->value ^ (__m128i)splat(true); + } +}; + +template struct base8_numeric : base8 { + static simdjson_inline simd8 splat(T value) { + (void)value; + return (__m128i)vec_splats(value); + } + static simdjson_inline simd8 zero() { return splat(0); } + static simdjson_inline simd8 load(const T values[16]) { + return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m128i _value) + : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[16]) const { + vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); + } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 
operator+(const simd8 other) const { + return (__m128i)((__m128i)this->value + (__m128i)other); + } + simdjson_inline simd8 operator-(const simd8 other) const { + return (__m128i)((__m128i)this->value - (__m128i)other); + } + simdjson_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdjson_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted + // as a bitset). Passing a 0 value for mask would be equivalent to writing out + // every byte to output. Only the first 16 - count_ones(mask) bytes of the + // result are significant but 16 bytes get written. Design consideration: it + // seems like a function with the signature simd8 compress(uint32_t mask) + // would be sensible, but the AVX ISA makes this kind of approach difficult. + template + simdjson_inline void compress(uint16_t mask, L *output) const { + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + using internal::thintable_epi8; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. 
+#ifdef __LITTLE_ENDIAN__ + __m128i shufmask = (__m128i)(__vector unsigned long long){ + thintable_epi8[mask1], thintable_epi8[mask2]}; +#else + __m128i shufmask = (__m128i)(__vector unsigned long long){ + thintable_epi8[mask2], thintable_epi8[mask1]}; + shufmask = (__m128i)vec_reve((__m128i)shufmask); +#endif + // we increment by 0x08 the second half of the mask + shufmask = ((__m128i)shufmask) + + ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808}); + + // this is the version "nearly pruned" + __m128i pruned = vec_perm(this->value, this->value, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = + vec_vsx_ld(0, reinterpret_cast(pshufb_combine_table + pop1 * 8)); + __m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask); + vec_vsx_st(answer, 0, reinterpret_cast<__m128i *>(output)); + } + + template + simdjson_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; + +// Signed bytes +template <> struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t *values) : simd8(load(values)) 
{} + // Member-by-member initialization + simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, + v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Order-sensitive comparisons + simdjson_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + operator>(const simd8 other) const { + return (__m128i)vec_cmpgt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + operator<(const simd8 other) const { + return (__m128i)vec_cmplt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + 
uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Saturated math + simdjson_inline simd8 + saturating_add(const simd8 other) const { + return (__m128i)vec_adds(this->value, (__m128i)other); + } + simdjson_inline simd8 + saturating_sub(const simd8 other) const { + return (__m128i)vec_subs(this->value, (__m128i)other); + } + + // Order-specific operations + simdjson_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max(this->value, (__m128i)other); + } + simdjson_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min(this->value, (__m128i)other); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); + } + simdjson_inline simd8 + operator<=(const simd8 other) const { + return other.max_val(*this) == other; + } + simdjson_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + simdjson_inline simd8 + operator>(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + simdjson_inline simd8 + operator<(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } 
+ + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { + return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); + } + simdjson_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); + } + simdjson_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdjson_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); + } + simdjson_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdjson_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { + return vec_all_eq(vec_and(this->value, (__m128i)bits), + (__m128i)vec_splats(0)); + } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { + return !bits_not_set_anywhere(bits); + } + template simdjson_inline simd8 shr() const { + return simd8( + (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + } + template simdjson_inline simd8 shl() const { + return simd8( + (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) + : chunks{simd8::load(ptr), simd8::load(ptr + 16), + simd8::load(ptr + 32), simd8::load(ptr + 48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0); + 
this->chunks[1].store(ptr + sizeof(simd8) * 1); + this->chunks[2].store(ptr + sizeof(simd8) * 2); + this->chunks[3].store(ptr + sizeof(simd8) * 3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + + simdjson_inline uint64_t compress(uint64_t mask, T *output) const { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), + output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), + output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), + output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + return 64 - count_ones(mask); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64(this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3]) + .to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); + } +}; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_SIMD_INPUT_H diff --git a/contrib/libs/simdjson/include/simdjson/ppc64/stringparsing_defs.h 
b/contrib/libs/simdjson/include/simdjson/ppc64/stringparsing_defs.h new file mode 100644 index 000000000000..b1bd46e521a9 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/ppc64/stringparsing_defs.h @@ -0,0 +1,65 @@ +#ifndef SIMDJSON_PPC64_STRINGPARSING_DEFS_H +#define SIMDJSON_PPC64_STRINGPARSING_DEFS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#error #include "simdjson/ppc64/base.h" +#error #include "simdjson/ppc64/bitmanipulation.h" +#error #include "simdjson/ppc64/simd.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace ppc64 { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote + copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { + return ((bs_bits - 1) & quote_bits) != 0; + } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { + return trailing_zeroes(quote_bits); + } + simdjson_inline int backslash_index() { + return trailing_zeroes(bs_bits); + } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote +backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), + "backslash and quote finder must process fewer than " + "SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); + + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on + // PPC; therefore, we smash them together into a 64-byte mask and get the + // bitmask from there. 
+ uint64_t bs_and_quote = + simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/simdjson.h b/contrib/libs/simdjson/include/simdjson/simdjson.h new file mode 100644 index 000000000000..551673aefb02 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/simdjson.h @@ -0,0 +1,11 @@ +/** + * @file + * @deprecated We will be removing this file so it is not confused with the top level simdjson.h + */ +#ifndef SIMDJSON_SIMDJSON_H +#define SIMDJSON_SIMDJSON_H + +#include "simdjson/compiler_check.h" +#include "simdjson/error.h" + +#endif // SIMDJSON_SIMDJSON_H diff --git a/contrib/libs/simdjson/include/simdjson/simdjson_version.h b/contrib/libs/simdjson/include/simdjson/simdjson_version.h new file mode 100644 index 000000000000..059bb9a5017c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/simdjson_version.h @@ -0,0 +1,26 @@ +// /include/simdjson/simdjson_version.h automatically generated by release.py, +// do not change by hand +#ifndef SIMDJSON_SIMDJSON_VERSION_H +#define SIMDJSON_SIMDJSON_VERSION_H + +/** The version of simdjson being used (major.minor.revision) */ +#define SIMDJSON_VERSION "3.10.1" + +namespace simdjson { +enum { + /** + * The major version (MAJOR.minor.revision) of simdjson being used. + */ + SIMDJSON_VERSION_MAJOR = 3, + /** + * The minor version (major.MINOR.revision) of simdjson being used. + */ + SIMDJSON_VERSION_MINOR = 10, + /** + * The revision (major.minor.REVISION) of simdjson being used. 
+ */ + SIMDJSON_VERSION_REVISION = 1 +}; +} // namespace simdjson + +#endif // SIMDJSON_SIMDJSON_VERSION_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere.h b/contrib/libs/simdjson/include/simdjson/westmere.h new file mode 100644 index 000000000000..f05ba1145c0b --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_WESTMERE_H +#define SIMDJSON_WESTMERE_H + +#include "simdjson/westmere/begin.h" +#include "simdjson/generic/amalgamated.h" +#include "simdjson/westmere/end.h" + +#endif // SIMDJSON_WESTMERE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/include/simdjson/westmere/base.h b/contrib/libs/simdjson/include/simdjson/westmere/base.h new file mode 100644 index 000000000000..82ad333a8dcb --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/base.h @@ -0,0 +1,29 @@ +#ifndef SIMDJSON_WESTMERE_BASE_H +#define SIMDJSON_WESTMERE_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE +namespace simdjson { +/** + * Implementation for Westmere (Intel SSE4.2). 
+ */ +namespace westmere { + +class implementation; + +namespace { +namespace simd { + +template struct simd8; +template struct simd8x64; + +} // namespace simd +} // unnamed namespace + +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_BASE_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/begin.h b/contrib/libs/simdjson/include/simdjson/westmere/begin.h new file mode 100644 index 000000000000..d807e6d9995c --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/begin.h @@ -0,0 +1,13 @@ +#define SIMDJSON_IMPLEMENTATION westmere +#include "simdjson/westmere/base.h" +#include "simdjson/westmere/intrinsics.h" + +#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE +SIMDJSON_TARGET_REGION("sse4.2,pclmul,popcnt") +#endif + +#include "simdjson/westmere/bitmanipulation.h" +#include "simdjson/westmere/bitmask.h" +#include "simdjson/westmere/numberparsing_defs.h" +#include "simdjson/westmere/simd.h" +#include "simdjson/westmere/stringparsing_defs.h" diff --git a/contrib/libs/simdjson/include/simdjson/westmere/bitmanipulation.h b/contrib/libs/simdjson/include/simdjson/westmere/bitmanipulation.h new file mode 100644 index 000000000000..7cd29a406a71 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/bitmanipulation.h @@ -0,0 +1,79 @@ +#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H +#define SIMDJSON_WESTMERE_BITMANIPULATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#include "simdjson/westmere/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace westmere { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +// This function can be used safely even if not all bytes have been +// initialized. 
+// See issue https://github.com/simdjson/simdjson/issues/1965 +SIMDJSON_NO_SANITIZE_MEMORY +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDJSON_REGULAR_VISUAL_STUDIO +} + +#if SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows in this kernel + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#if SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H diff --git 
a/contrib/libs/simdjson/include/simdjson/westmere/bitmask.h b/contrib/libs/simdjson/include/simdjson/westmere/bitmask.h new file mode 100644 index 000000000000..cd79b724119d --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/bitmask.h @@ -0,0 +1,30 @@ +#ifndef SIMDJSON_WESTMERE_BITMASK_H +#define SIMDJSON_WESTMERE_BITMASK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#include "simdjson/westmere/intrinsics.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace westmere { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_BITMASK_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/end.h b/contrib/libs/simdjson/include/simdjson/westmere/end.h new file mode 100644 index 000000000000..bd4a94049033 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/end.h @@ -0,0 +1,9 @@ +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE +SIMDJSON_UNTARGET_REGION +#endif + +#undef SIMDJSON_IMPLEMENTATION diff --git a/contrib/libs/simdjson/include/simdjson/westmere/implementation.h b/contrib/libs/simdjson/include/simdjson/westmere/implementation.h new file mode 100644 index 000000000000..37392be2ae9f --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/implementation.h @@ -0,0 +1,32 @@ +#ifndef SIMDJSON_WESTMERE_IMPLEMENTATION_H +#define 
SIMDJSON_WESTMERE_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#include "simdjson/implementation.h" +#include "simdjson/internal/instruction_set.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE +namespace simdjson { +namespace westmere { + +/** + * @private + */ +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/intrinsics.h b/contrib/libs/simdjson/include/simdjson/westmere/intrinsics.h new file mode 100644 index 000000000000..63a351c80900 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/intrinsics.h @@ -0,0 +1,31 @@ +#ifndef SIMDJSON_WESTMERE_INTRINSICS_H +#define SIMDJSON_WESTMERE_INTRINSICS_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + + +#if SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. 
However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include // for _mm_alignr_epi8 +#include // for _mm_clmulepi64_si128 +#endif + +static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere"); + +#endif // SIMDJSON_WESTMERE_INTRINSICS_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/numberparsing_defs.h b/contrib/libs/simdjson/include/simdjson/westmere/numberparsing_defs.h new file mode 100644 index 000000000000..05cfccfd10d4 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/numberparsing_defs.h @@ -0,0 +1,59 @@ +#ifndef SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H +#define SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H + +#include "simdjson/westmere/base.h" +#include "simdjson/westmere/intrinsics.h" + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/internal/numberparsing_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace westmere { +namespace numberparsing { + +/** @private */ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so we are being wasteful. 
+ const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +/** @private */ +simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) { + internal::value128 answer; +#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS +#if SIMDJSON_IS_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // SIMDJSON_IS_ARM64 +#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace numberparsing +} // namespace westmere +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +#endif // SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/ondemand.h b/contrib/libs/simdjson/include/simdjson/westmere/ondemand.h new file mode 100644 index 000000000000..40d4ce2551b3 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/ondemand.h @@ -0,0 +1,8 @@ +#ifndef SIMDJSON_WESTMERE_ONDEMAND_H +#define SIMDJSON_WESTMERE_ONDEMAND_H + +#include "simdjson/westmere/begin.h" 
+#include "simdjson/generic/ondemand/amalgamated.h" +#include "simdjson/westmere/end.h" + +#endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/simd.h b/contrib/libs/simdjson/include/simdjson/westmere/simd.h new file mode 100644 index 000000000000..28329d9642eb --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/simd.h @@ -0,0 +1,338 @@ +#ifndef SIMDJSON_WESTMERE_SIMD_H +#define SIMDJSON_WESTMERE_SIMD_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include "simdjson/westmere/base.h" +#include "simdjson/westmere/bitmanipulation.h" +#include "simdjson/internal/simdprune_tables.h" +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace westmere { +namespace { +namespace simd { + + template + struct base { + __m128i value; + + // Zero constructor + simdjson_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m128i&() const { return this->value; } + simdjson_inline operator __m128i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; 
return *this_cast; } + }; + + template> + struct base8: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m128i _value) : base>(_value) {} + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm_cmpeq_epi8(lhs, rhs); } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm_setzero_si128(); } + static simdjson_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[16]) const { return 
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
+ template + simdjson_inline void compress(uint16_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
+ __m128i compactmask = + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const 
simd8 other) const { return _mm_min_epi8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm_max_epu8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm_min_epu8(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 
other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + this->chunks[2].store(ptr+sizeof(simd8)*2); + this->chunks[3].store(ptr+sizeof(simd8)*3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + return 64 - count_ones(mask); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdjson_inline 
uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H diff --git a/contrib/libs/simdjson/include/simdjson/westmere/stringparsing_defs.h b/contrib/libs/simdjson/include/simdjson/westmere/stringparsing_defs.h new file mode 100644 index 000000000000..439f19cbc731 --- /dev/null +++ b/contrib/libs/simdjson/include/simdjson/westmere/stringparsing_defs.h @@ -0,0 +1,47 @@ +#ifndef SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H +#define SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H + +#include "simdjson/westmere/bitmanipulation.h" +#include "simdjson/westmere/simd.h" + +namespace simdjson { +namespace westmere { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + 16); + v0.store(dst); + v1.store(dst + 16); + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H diff --git a/contrib/libs/simdjson/src/arm64.cpp b/contrib/libs/simdjson/src/arm64.cpp new file mode 100644 index 000000000000..436757d53227 --- /dev/null +++ b/contrib/libs/simdjson/src/arm64.cpp @@ -0,0 +1,172 @@ +#ifndef SIMDJSON_SRC_ARM64_CPP +#define SIMDJSON_SRC_ARM64_CPP + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +#include +#include +#include +#include + +// +// Stage 1 +// +namespace simdjson { +namespace arm64 { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( 
new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +namespace { + +using namespace simd; + +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // Functional programming causes trouble with Visual Studio. + // Keeping this version in comments since it is much nicer: + // auto v = in.map([&](simd8 chunk) { + // auto nib_lo = chunk & 0xf; + // auto nib_hi = chunk.shr<4>(); + // auto shuf_lo = nib_lo.lookup_16(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + // auto shuf_hi = nib_hi.lookup_16(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + // return shuf_lo & shuf_hi; + // }); + const simd8 table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + const simd8 table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + + simd8x64 v( + (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2), + (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2), + (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2), + (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2) + ); + + + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). *However* if we only need spaces, + // it is likely that we will still compute 'v' above with two lookup_16: one + // could do it a bit cheaper. This is in contrast with the x64 implementations + // where we can, efficiently, do the white space and structural matching + // separately. 
One reason for this difference is that on ARM NEON, the table + // lookups either zero or leave unchanged the characters exceeding 0xF whereas + // on x64, the equivalent instruction (pshufb) automatically applies a mask, + // ignoring the 4 most significant bits. Thus the x64 implementation is + // optimized differently. This being said, if you use this code strictly + // just for minification (or just to identify the structural characters), + // there is a small untaken optimization opportunity here. We deliberately + // do not pick it up. + + uint64_t op = simd8x64( + v.chunks[0].any_bits_set(0x7), + v.chunks[1].any_bits_set(0x7), + v.chunks[2].any_bits_set(0x7), + v.chunks[3].any_bits_set(0x7) + ).to_bitmask(); + + uint64_t whitespace = simd8x64( + v.chunks[0].any_bits_set(0x18), + v.chunks[1].any_bits_set(0x18), + v.chunks[2].any_bits_set(0x18), + v.chunks[3].any_bits_set(0x18) + ).to_bitmask(); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0x80u; +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0xc0u); + simd8 is_third_byte = prev2 >= uint8_t(0xe0u); + simd8 is_fourth_byte = prev3 >= uint8_t(0xf0u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. + // This will work fine because we only have to report errors for cases with 0-1 lead bytes. + // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. 
+ return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 + return is_third_byte | is_fourth_byte; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +// +// Stage 2 +// + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace arm64 { + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept { + return arm64::stringparsing::parse_string(src, dst, allow_replacement); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept { + return arm64::stringparsing::parse_wobbly_string(src, dst); +} + +simdjson_warn_unused error_code 
dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace arm64 +} // namespace simdjson + +#include + +#endif // SIMDJSON_SRC_ARM64_CPP diff --git a/contrib/libs/simdjson/src/base.h b/contrib/libs/simdjson/src/base.h new file mode 100644 index 000000000000..67873b2f9c5c --- /dev/null +++ b/contrib/libs/simdjson/src/base.h @@ -0,0 +1,6 @@ +#ifndef SIMDJSON_SRC_BASE_H +#define SIMDJSON_SRC_BASE_H + +#include + +#endif // SIMDJSON_SRC_BASE_H diff --git a/contrib/libs/simdjson/src/fallback.cpp b/contrib/libs/simdjson/src/fallback.cpp new file mode 100644 index 000000000000..f8e87be06b84 --- /dev/null +++ b/contrib/libs/simdjson/src/fallback.cpp @@ -0,0 +1,410 @@ +#ifndef SIMDJSON_SRC_FALLBACK_CPP +#define SIMDJSON_SRC_FALLBACK_CPP + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// +// Stage 1 +// + +namespace simdjson { +namespace fallback { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) SIMDJSON_IMPLEMENTATION::dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +namespace { +namespace stage1 { + +class structural_scanner { +public: + +simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial) + : buf{_parser.buf}, + next_structural_index{_parser.structural_indexes.get()}, + parser{_parser}, + len{static_cast(_parser.len)}, + partial{_partial} { +} + +simdjson_inline void add_structural() { + *next_structural_index = idx; + 
next_structural_index++; +} + +simdjson_inline bool is_continuation(uint8_t c) { + return (c & 0xc0) == 0x80; +} + +simdjson_inline void validate_utf8_character() { + // Continuation + if (simdjson_unlikely((buf[idx] & 0x40) == 0)) { + // extra continuation + error = UTF8_ERROR; + idx++; + return; + } + + // 2-byte + if ((buf[idx] & 0x20) == 0) { + // missing continuation + if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { + if (idx+1 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 1100000_ 10______ + if (buf[idx] <= 0xc1) { error = UTF8_ERROR; } + idx += 2; + return; + } + + // 3-byte + if ((buf[idx] & 0x10) == 0) { + // missing continuation + if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11100000 100_____ ________ + if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; } + // surrogates: U+D800-U+DFFF 11101101 101_____ + if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; } + idx += 3; + return; + } + + // 4-byte + // missing continuation + if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11110000 1000____ ________ ________ + if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; } + // too large: > U+10FFFF: + // 11110100 (1001|101_)____ + // 1111(1___|011_|0101) 10______ + // also includes 5, 6, 7 and 8 byte characters: + // 11111___ + if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; } + if (buf[idx] >= 0xf5) { error = UTF8_ERROR; } + idx += 4; +} + +// Returns true if the string is unclosed. 
+simdjson_inline bool validate_string() { + idx++; // skip first quote + while (idx < len && buf[idx] != '"') { + if (buf[idx] == '\\') { + idx += 2; + } else if (simdjson_unlikely(buf[idx] & 0x80)) { + validate_utf8_character(); + } else { + if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; } + idx++; + } + } + if (idx >= len) { return true; } + return false; +} + +simdjson_inline bool is_whitespace_or_operator(uint8_t c) { + switch (c) { + case '{': case '}': case '[': case ']': case ',': case ':': + case ' ': case '\r': case '\n': case '\t': + return true; + default: + return false; + } +} + +// +// Parse the entire input in STEP_SIZE-byte chunks. +// +simdjson_inline error_code scan() { + bool unclosed_string = false; + for (;idx 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + parser.n_structural_indexes = new_structural_indexes; + } else if(partial == stage1_mode::streaming_final) { + if(unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). 
+ parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (parser.n_structural_indexes == 0) { return EMPTY; } + } else if(unclosed_string) { error = UNCLOSED_STRING; } + return error; +} + +private: + const uint8_t *buf; + uint32_t *next_structural_index; + dom_parser_implementation &parser; + uint32_t len; + uint32_t idx{0}; + error_code error{SUCCESS}; + stage1_mode partial; +}; // structural_scanner + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept { + this->buf = _buf; + this->len = _len; + stage1::structural_scanner scanner(*this, partial); + return scanner.scan(); +} + +// big table for the minifier +static uint8_t jump_table[256 * 3] = { + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, + 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, +}; + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + size_t i = 0, pos = 0; + uint8_t quote = 0; + uint8_t nonescape = 1; + + while (i < 
len) { + unsigned char c = buf[i]; + uint8_t *meta = jump_table + 3 * c; + + quote = quote ^ (meta[0] & nonescape); + dst[pos] = c; + pos += meta[2] | quote; + + i += 1; + nonescape = uint8_t(~nonescape) | (meta[1]); + } + dst_len = pos; // we intentionally do not work with a reference + // for fear of aliasing + return quote ? UNCLOSED_STRING : SUCCESS; +} + +// credit: based on code from Google Fuchsia (Apache Licensed) +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 8 bytes are ascii. + uint64_t next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + if (byte < 0x80) { + pos++; + continue; + } else if ((byte & 0xe0) == 0xc0) { + next_pos = pos + 2; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0xc0) != 0x80) { return false; } + // range check + code_point = (byte & 0x1f) << 6 | (data[pos + 1] & 0x3f); + if (code_point < 0x80 || 0x7ff < code_point) { return false; } + } else if ((byte & 0xf0) == 0xe0) { + next_pos = pos + 3; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0xc0) != 0x80) { return false; } + if ((data[pos + 2] & 0xc0) != 0x80) { return false; } + // range check + code_point = (byte & 0x0f) << 12 | + (data[pos + 1] & 0x3f) << 6 | + (data[pos + 2] & 0x3f); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0xf8) == 0xf0) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return false; } + if ((data[pos + 
1] & 0xc0) != 0x80) { return false; } + if ((data[pos + 2] & 0xc0) != 0x80) { return false; } + if ((data[pos + 3] & 0xc0) != 0x80) { return false; } + // range check + code_point = + (byte & 0x07) << 18 | (data[pos + 1] & 0x3f) << 12 | + (data[pos + 2] & 0x3f) << 6 | (data[pos + 3] & 0x3f); + if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } + } else { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; +} + +} // namespace fallback +} // namespace simdjson + +// +// Stage 2 +// + +namespace simdjson { +namespace fallback { + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept { + return fallback::stringparsing::parse_string(src, dst, replacement_char); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept { + return fallback::stringparsing::parse_wobbly_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace fallback +} // namespace simdjson + +#include + +#endif // SIMDJSON_SRC_FALLBACK_CPP diff --git a/contrib/libs/simdjson/src/from_chars.cpp b/contrib/libs/simdjson/src/from_chars.cpp new file mode 100644 index 000000000000..34d62a3d7d22 --- /dev/null +++ b/contrib/libs/simdjson/src/from_chars.cpp @@ -0,0 +1,606 @@ +#ifndef SIMDJSON_SRC_FROM_CHARS_CPP +#define SIMDJSON_SRC_FROM_CHARS_CPP 
+ +#include + +#include +#include +#include + +namespace simdjson { +namespace internal { + +/** + * The code in the internal::from_chars function is meant to handle the floating-point number parsing + * when we have more than 19 digits in the decimal mantissa. This should only be seen + * in adversarial scenarios: we do not expect production systems to even produce + * such floating-point numbers. + * + * The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/) + * who credits Ken Thompson for the design (via a reference to the Go source + * code). See + * https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c + * https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c + * It is probably not very fast but it is a fallback that should almost never be + * called in real life. Google Wuffs is published under APL 2.0. + **/ + +namespace { +constexpr uint32_t max_digits = 768; +constexpr int32_t decimal_point_range = 2047; +} // namespace + +struct adjusted_mantissa { + uint64_t mantissa; + int power2; + adjusted_mantissa() : mantissa(0), power2(0) {} +}; + +struct decimal { + uint32_t num_digits; + int32_t decimal_point; + bool negative; + bool truncated; + uint8_t digits[max_digits]; +}; + +template struct binary_format { + static constexpr int mantissa_explicit_bits(); + static constexpr int minimum_exponent(); + static constexpr int infinite_power(); + static constexpr int sign_index(); +}; + +template <> constexpr int binary_format::mantissa_explicit_bits() { + return 52; +} + +template <> constexpr int binary_format::minimum_exponent() { + return -1023; +} +template <> constexpr int binary_format::infinite_power() { + return 0x7FF; +} + +template <> constexpr int binary_format::sign_index() { return 63; } + +bool is_integer(char c) noexcept { return (c >= '0' && c <= '9'); } + +// This should always succeed 
since it follows a call to parse_number. +decimal parse_decimal(const char *&p) noexcept { + decimal answer; + answer.num_digits = 0; + answer.decimal_point = 0; + answer.truncated = false; + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) { + ++p; + } + + while (*p == '0') { + ++p; + } + while (is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + if (*p == '.') { + ++p; + const char *first_after_period = p; + // if we have not yet encountered a zero, we have to skip it as well + if (answer.num_digits == 0) { + // skip zeros + while (*p == '0') { + ++p; + } + } + while (is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + answer.decimal_point = int32_t(first_after_period - p); + } + if(answer.num_digits > 0) { + const char *preverse = p - 1; + int32_t trailing_zeros = 0; + while ((*preverse == '0') || (*preverse == '.')) { + if(*preverse == '0') { trailing_zeros++; }; + --preverse; + } + answer.decimal_point += int32_t(answer.num_digits); + answer.num_digits -= uint32_t(trailing_zeros); + } + if(answer.num_digits > max_digits ) { + answer.num_digits = max_digits; + answer.truncated = true; + } + if (('e' == *p) || ('E' == *p)) { + ++p; + bool neg_exp = false; + if ('-' == *p) { + neg_exp = true; + ++p; + } else if ('+' == *p) { + ++p; + } + int32_t exp_number = 0; // exponential part + while (is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + answer.decimal_point += (neg_exp ? -exp_number : exp_number); + } + return answer; +} + +// This should always succeed since it follows a call to parse_number. +// Will not read at or beyond the "end" pointer. 
+decimal parse_decimal(const char *&p, const char * end) noexcept { + decimal answer; + answer.num_digits = 0; + answer.decimal_point = 0; + answer.truncated = false; + if(p == end) { return answer; } // should never happen + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) { + ++p; + } + + while ((p != end) && (*p == '0')) { + ++p; + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + if ((p != end) && (*p == '.')) { + ++p; + if(p == end) { return answer; } // should never happen + const char *first_after_period = p; + // if we have not yet encountered a zero, we have to skip it as well + if (answer.num_digits == 0) { + // skip zeros + while (*p == '0') { + ++p; + } + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + answer.decimal_point = int32_t(first_after_period - p); + } + if(answer.num_digits > 0) { + const char *preverse = p - 1; + int32_t trailing_zeros = 0; + while ((*preverse == '0') || (*preverse == '.')) { + if(*preverse == '0') { trailing_zeros++; }; + --preverse; + } + answer.decimal_point += int32_t(answer.num_digits); + answer.num_digits -= uint32_t(trailing_zeros); + } + if(answer.num_digits > max_digits ) { + answer.num_digits = max_digits; + answer.truncated = true; + } + if ((p != end) && (('e' == *p) || ('E' == *p))) { + ++p; + if(p == end) { return answer; } // should never happen + bool neg_exp = false; + if ('-' == *p) { + neg_exp = true; + ++p; + } else if ('+' == *p) { + ++p; + } + int32_t exp_number = 0; // exponential part + while ((p != end) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + answer.decimal_point += (neg_exp ? 
-exp_number : exp_number); + } + return answer; +} + +namespace { + +// remove all final zeroes +inline void trim(decimal &h) { + while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) { + h.num_digits--; + } +} + +uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) { + shift &= 63; + const static uint16_t number_of_digits_decimal_left_shift_table[65] = { + 0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817, + 0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067, + 0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF, + 0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0, + 0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA, + 0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC, + 0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C, + 0x051C, 0x051C, + }; + uint32_t x_a = number_of_digits_decimal_left_shift_table[shift]; + uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1]; + uint32_t num_new_digits = x_a >> 11; + uint32_t pow5_a = 0x7FF & x_a; + uint32_t pow5_b = 0x7FF & x_b; + const static uint8_t + number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = { + 5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5, + 3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8, + 2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2, + 5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1, + 5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5, + 3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2, + 8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3, + 7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5, + 6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6, + 0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3, + 8, 7, 6, 9, 5, 3, 1, 2, 
5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7, + 6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2, + 5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8, + 6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3, + 2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1, + 2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6, + 4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3, + 2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6, + 6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3, + 8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5, + 5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5, + 7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3, + 1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6, + 6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6, + 4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7, + 2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7, + 3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5, + 2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5, + 9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0, + 2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8, + 8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5, + 2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4, + 9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2, + 0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5, + 4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7, + 5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9, + 2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5, + 6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9, + 4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 
4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3, + 2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8, + 9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2, + 3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1, + 3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1, + 1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3, + 1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2, + 3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1, + 0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3, + 5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1, + 3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3, + 9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3, + 9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6, + 7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3, + 6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7, + 6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9, + 4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2, + 5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9, + 6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5, + }; + const uint8_t *pow5 = + &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a]; + uint32_t i = 0; + uint32_t n = pow5_b - pow5_a; + for (; i < n; i++) { + if (i >= h.num_digits) { + return num_new_digits - 1; + } else if (h.digits[i] == pow5[i]) { + continue; + } else if (h.digits[i] < pow5[i]) { + return num_new_digits - 1; + } else { + return num_new_digits; + } + } + return num_new_digits; +} + +} // end of anonymous namespace + +uint64_t round(decimal &h) { + if ((h.num_digits == 0) || (h.decimal_point < 0)) { + return 0; + } else if (h.decimal_point > 18) { + return UINT64_MAX; + } + // at this point, we know that h.decimal_point >= 0 + uint32_t dp = 
uint32_t(h.decimal_point); + uint64_t n = 0; + for (uint32_t i = 0; i < dp; i++) { + n = (10 * n) + ((i < h.num_digits) ? h.digits[i] : 0); + } + bool round_up = false; + if (dp < h.num_digits) { + round_up = h.digits[dp] >= 5; // normally, we round up + // but we may need to round to even! + if ((h.digits[dp] == 5) && (dp + 1 == h.num_digits)) { + round_up = h.truncated || ((dp > 0) && (1 & h.digits[dp - 1])); + } + } + if (round_up) { + n++; + } + return n; +} + +// computes h * 2^-shift +void decimal_left_shift(decimal &h, uint32_t shift) { + if (h.num_digits == 0) { + return; + } + uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift); + int32_t read_index = int32_t(h.num_digits - 1); + uint32_t write_index = h.num_digits - 1 + num_new_digits; + uint64_t n = 0; + + while (read_index >= 0) { + n += uint64_t(h.digits[read_index]) << shift; + uint64_t quotient = n / 10; + uint64_t remainder = n - (10 * quotient); + if (write_index < max_digits) { + h.digits[write_index] = uint8_t(remainder); + } else if (remainder > 0) { + h.truncated = true; + } + n = quotient; + write_index--; + read_index--; + } + while (n > 0) { + uint64_t quotient = n / 10; + uint64_t remainder = n - (10 * quotient); + if (write_index < max_digits) { + h.digits[write_index] = uint8_t(remainder); + } else if (remainder > 0) { + h.truncated = true; + } + n = quotient; + write_index--; + } + h.num_digits += num_new_digits; + if (h.num_digits > max_digits) { + h.num_digits = max_digits; + } + h.decimal_point += int32_t(num_new_digits); + trim(h); +} + +// computes h * 2^shift +void decimal_right_shift(decimal &h, uint32_t shift) { + uint32_t read_index = 0; + uint32_t write_index = 0; + + uint64_t n = 0; + + while ((n >> shift) == 0) { + if (read_index < h.num_digits) { + n = (10 * n) + h.digits[read_index++]; + } else if (n == 0) { + return; + } else { + while ((n >> shift) == 0) { + n = 10 * n; + read_index++; + } + break; + } + } + h.decimal_point -= int32_t(read_index - 1); 
+ if (h.decimal_point < -decimal_point_range) { // it is zero + h.num_digits = 0; + h.decimal_point = 0; + h.negative = false; + h.truncated = false; + return; + } + uint64_t mask = (uint64_t(1) << shift) - 1; + while (read_index < h.num_digits) { + uint8_t new_digit = uint8_t(n >> shift); + n = (10 * (n & mask)) + h.digits[read_index++]; + h.digits[write_index++] = new_digit; + } + while (n > 0) { + uint8_t new_digit = uint8_t(n >> shift); + n = 10 * (n & mask); + if (write_index < max_digits) { + h.digits[write_index++] = new_digit; + } else if (new_digit > 0) { + h.truncated = true; + } + } + h.num_digits = write_index; + trim(h); +} + +template adjusted_mantissa compute_float(decimal &d) { + adjusted_mantissa answer; + if (d.num_digits == 0) { + // should be zero + answer.power2 = 0; + answer.mantissa = 0; + return answer; + } + // At this point, going further, we can assume that d.num_digits > 0. + // We want to guard against excessive decimal point values because + // they can result in long running times. Indeed, we do + // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22 + // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not + // fine (runs for a long time). + // + if(d.decimal_point < -324) { + // We have something smaller than 1e-324 which is always zero + // in binary64 and binary32. + // It should be zero. + answer.power2 = 0; + answer.mantissa = 0; + return answer; + } else if(d.decimal_point >= 310) { + // We have something at least as large as 0.1e310 which is + // always infinite. + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + + static const uint32_t max_shift = 60; + static const uint32_t num_powers = 19; + static const uint8_t powers[19] = { + 0, 3, 6, 9, 13, 16, 19, 23, 26, 29, // + 33, 36, 39, 43, 46, 49, 53, 56, 59, // + }; + int32_t exp2 = 0; + while (d.decimal_point > 0) { + uint32_t n = uint32_t(d.decimal_point); + uint32_t shift = (n < num_powers) ? 
powers[n] : max_shift; + decimal_right_shift(d, shift); + if (d.decimal_point < -decimal_point_range) { + // should be zero + answer.power2 = 0; + answer.mantissa = 0; + return answer; + } + exp2 += int32_t(shift); + } + // We shift left toward [1/2 ... 1]. + while (d.decimal_point <= 0) { + uint32_t shift; + if (d.decimal_point == 0) { + if (d.digits[0] >= 5) { + break; + } + shift = (d.digits[0] < 2) ? 2 : 1; + } else { + uint32_t n = uint32_t(-d.decimal_point); + shift = (n < num_powers) ? powers[n] : max_shift; + } + decimal_left_shift(d, shift); + if (d.decimal_point > decimal_point_range) { + // we want to get infinity: + answer.power2 = 0xFF; + answer.mantissa = 0; + return answer; + } + exp2 -= int32_t(shift); + } + // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2]. + exp2--; + constexpr int32_t minimum_exponent = binary::minimum_exponent(); + while ((minimum_exponent + 1) > exp2) { + uint32_t n = uint32_t((minimum_exponent + 1) - exp2); + if (n > max_shift) { + n = max_shift; + } + decimal_right_shift(d, n); + exp2 += int32_t(n); + } + if ((exp2 - minimum_exponent) >= binary::infinite_power()) { + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + + const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1; + decimal_left_shift(d, mantissa_size_in_bits); + + uint64_t mantissa = round(d); + // It is possible that we have an overflow, in which case we need + // to shift back. 
+ if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) { + decimal_right_shift(d, 1); + exp2 += 1; + mantissa = round(d); + if ((exp2 - minimum_exponent) >= binary::infinite_power()) { + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + } + answer.power2 = exp2 - binary::minimum_exponent(); + if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) { + answer.power2--; + } + answer.mantissa = + mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1); + return answer; +} + +template +adjusted_mantissa parse_long_mantissa(const char *first) { + decimal d = parse_decimal(first); + return compute_float(d); +} + +template +adjusted_mantissa parse_long_mantissa(const char *first, const char *end) { + decimal d = parse_decimal(first, end); + return compute_float(d); +} + +double from_chars(const char *first) noexcept { + bool negative = first[0] == '-'; + if (negative) { + first++; + } + adjusted_mantissa am = parse_long_mantissa>(first); + uint64_t word = am.mantissa; + word |= uint64_t(am.power2) + << binary_format::mantissa_explicit_bits(); + word = negative ? word | (uint64_t(1) << binary_format::sign_index()) + : word; + double value; + std::memcpy(&value, &word, sizeof(double)); + return value; +} + + +double from_chars(const char *first, const char *end) noexcept { + bool negative = first[0] == '-'; + if (negative) { + first++; + } + adjusted_mantissa am = parse_long_mantissa>(first, end); + uint64_t word = am.mantissa; + word |= uint64_t(am.power2) + << binary_format::mantissa_explicit_bits(); + word = negative ? 
word | (uint64_t(1) << binary_format::sign_index()) + : word; + double value; + std::memcpy(&value, &word, sizeof(double)); + return value; +} + +} // internal +} // simdjson + +#endif // SIMDJSON_SRC_FROM_CHARS_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/amalgamated.h b/contrib/libs/simdjson/src/generic/amalgamated.h new file mode 100644 index 000000000000..32154f602e59 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/amalgamated.h @@ -0,0 +1,7 @@ +#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H) +#error generic/dependencies.h must be included before generic/amalgamated.h! +#endif + +#include +#include +#include diff --git a/contrib/libs/simdjson/src/generic/base.h b/contrib/libs/simdjson/src/generic/base.h new file mode 100644 index 000000000000..77947651a8f8 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/base.h @@ -0,0 +1,19 @@ +#ifndef SIMDJSON_SRC_GENERIC_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_BASE_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { + +struct json_character_block; + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/dependencies.h b/contrib/libs/simdjson/src/generic/dependencies.h new file mode 100644 index 000000000000..ce5a2f01111a --- /dev/null +++ b/contrib/libs/simdjson/src/generic/dependencies.h @@ -0,0 +1,10 @@ +#ifdef SIMDJSON_CONDITIONAL_INCLUDE +#error generic/dependencies.h must be included before defining SIMDJSON_CONDITIONAL_INCLUDE! 
+#endif + +#ifndef SIMDJSON_SRC_GENERIC_DEPENDENCIES_H +#define SIMDJSON_SRC_GENERIC_DEPENDENCIES_H + +#include + +#endif // SIMDJSON_SRC_GENERIC_DEPENDENCIES_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/dom_parser_implementation.h b/contrib/libs/simdjson/src/generic/dom_parser_implementation.h new file mode 100644 index 000000000000..20f7813fcae5 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/dom_parser_implementation.h @@ -0,0 +1,21 @@ +#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// Interface a dom parser implementation must fulfill +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3); +simdjson_inline bool is_ascii(const simd8x64& input); + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/json_character_block.h b/contrib/libs/simdjson/src/generic/json_character_block.h new file mode 100644 index 000000000000..7cce34c83b35 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/json_character_block.h @@ -0,0 +1,27 @@ +#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { + +struct json_character_block { + static simdjson_inline json_character_block classify(const simd::simd8x64& in); + + simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; } + simdjson_inline uint64_t op() const noexcept { return _op; } + simdjson_inline uint64_t scalar() const 
noexcept { return ~(op() | whitespace()); } + + uint64_t _whitespace; + uint64_t _op; +}; + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/amalgamated.h b/contrib/libs/simdjson/src/generic/stage1/amalgamated.h new file mode 100644 index 000000000000..ed083677fd08 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/amalgamated.h @@ -0,0 +1,13 @@ +// Stuff other things depend on +#include +#include +#include +#include +#include +#include + +// All other declarations +#include +#include +#include +#include diff --git a/contrib/libs/simdjson/src/generic/stage1/base.h b/contrib/libs/simdjson/src/generic/stage1/base.h new file mode 100644 index 000000000000..a6413fadba44 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/base.h @@ -0,0 +1,35 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +class bit_indexer; +template +struct buf_block_reader; +struct json_block; +class json_minifier; +class json_scanner; +struct json_string_block; +class json_string_scanner; +class json_structural_indexer; + +} // namespace stage1 + +namespace utf8_validation { +struct utf8_checker; +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H diff --git a/contrib/libs/simdjson/src/generic/stage1/buf_block_reader.h b/contrib/libs/simdjson/src/generic/stage1/buf_block_reader.h new file mode 100644 index 000000000000..b3e4ec7d5903 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/buf_block_reader.h @@ -0,0 
+1,116 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_input_text(const simd8x64& in, uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] <= ' ') { buf[i] = '_'; } + if (!(mask & (size_t(1) << i))) { buf[i] = ' '; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/dependencies.h b/contrib/libs/simdjson/src/generic/stage1/dependencies.h new file mode 100644 index 000000000000..dfd8d8fa93e0 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/dependencies.h @@ -0,0 +1,4 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H +#define SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/find_next_document_index.h b/contrib/libs/simdjson/src/generic/stage1/find_next_document_index.h new file mode 100644 index 000000000000..162595438abd --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/find_next_document_index.h @@ -0,0 +1,105 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. 
We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. 
+ switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } + return 0; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/json_escape_scanner.h b/contrib/libs/simdjson/src/generic/stage1/json_escape_scanner.h new file mode 100644 index 000000000000..ee58e1ce5c84 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/json_escape_scanner.h @@ -0,0 +1,151 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +/** + * Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n). + */ +struct json_escape_scanner { + /** The actual escape characters (the backslashes themselves). */ + uint64_t next_is_escaped = 0ULL; + + struct escaped_and_escape { + /** + * Mask of escaped characters. + * + * ``` + * \n \\n \\\n \\\\n \ + * 0100100010100101000 + * n \ \ n \ \ + * ``` + */ + uint64_t escaped; + /** + * Mask of escape characters. + * + * ``` + * \n \\n \\\n \\\\n \ + * 1001000101001010001 + * \ \ \ \ \ \ \ + * ``` + */ + uint64_t escape; + }; + + /** + * Get a mask of both escape and escaped characters (the characters following a backslash). + * + * @param potential_escape A mask of the character that can escape others (but could be + * escaped itself). e.g. 
block.eq('\\') + */ + simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept { + +#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT + if (!backslash) { return {next_escaped_without_backslashes(), 0}; } +#endif + + // | | Mask (shows characters instead of 1's) | Depth | Instructions | + // |--------------------------------|----------------------------------------|-------|---------------------| + // | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | | + // | | ` even odd even odd odd` | | | + // | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped) + // | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code()) + // | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped)) + // | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash) + // | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) () + // (*) this is not needed until the next iteration + uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped); + uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped); + uint64_t escape = escape_and_terminal_code & backslash; + this->next_is_escaped = escape >> 63; + return {escaped, escape}; + } + +private: + static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL; + + simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept { + uint64_t escaped = this->next_is_escaped; + this->next_is_escaped = 0; + return escaped; + } + + /** + * Returns a mask of the next escape characters (masking out escaped backslashes), along with + * any non-backslash escape codes. + * + * \n \\n \\\n \\\\n returns: + * \n \ \ \n \ \ + * 11 100 1011 10100 + * + * You are expected to mask out the first bit yourself if the previous block had a trailing + * escape. 
+ * + * & the result with potential_escape to get just the escape characters. + * ^ the result with (potential_escape | first_is_escaped) to get escaped characters. + */ + static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept { + // If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer: + // any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be + // inverted (\\\ would be 010 instead of 101). + // + // ``` + // string: | ____\\\\_\\\\_____ | + // maybe_escaped | ODD | \ \ \ \ | + // even-aligned ^^^ ^^^^ odd-aligned + // ``` + // + // Taking that into account, our basic strategy is: + // + // 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for + // odd-aligned runs. + // 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the + // odd bits in odd-aligned runs. + // 3. & with backslash to clean up any stray bits. + // runs are set to 0, and then XORing with "odd": + // + // | | Mask (shows characters instead of 1's) | Instructions | + // |--------------------------------|----------------------------------------|---------------------| + // | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | + // | | ` even odd even odd odd` | + // | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1) + // | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd) + // | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape) + // | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd) + // + + // Escaped characters are characters following an escape. + uint64_t maybe_escaped = potential_escape << 1; + + // To distinguish odd from even escape sequences, therefore, we turn on any *starting* + // escapes that are on an odd byte. (We actually bring in all odd bits, for speed.) 
+ // - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1. + // - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0. + // - All other odd bytes are 1, and even bytes are 0. + uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS; + uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape; + + // Now we flip all odd bytes back with xor. This: + // - Makes odd runs of backslashes go from 0000 to 1010 + // - Makes even runs of backslashes go from 1111 to 1010 + // - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100) + // - Resets all other bytes to 0 + return even_series_codes_and_odd_bits ^ ODD_BITS; + } +}; + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/json_minifier.h b/contrib/libs/simdjson/src/generic/stage1/json_minifier.h new file mode 100644 index 000000000000..22ddaf2dd837 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/json_minifier.h @@ -0,0 +1,104 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + simdjson_inline 
json_minifier(uint8_t *_dst) + : dst{_dst} + {} + template + simdjson_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block); + simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; +}; + +simdjson_inline void json_minifier::next(const simd::simd8x64& in, const json_block& block) { + uint64_t mask = block.whitespace(); + dst += in.compress(mask, dst); +} + +simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + error_code error = scanner.finish(); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + size_t remaining_bytes = reader.get_remainder(block); + if (remaining_bytes > 0) { + // We do not want to write directly to the output stream. Rather, we write + // to a local buffer (for safety). 
+ uint8_t out_block[STEP_SIZE]; + uint8_t * const guarded_dst{minifier.dst}; + minifier.dst = out_block; + minifier.step(block, reader); + size_t to_write = minifier.dst - out_block; + // In some cases, we could be enticed to consider the padded spaces + // as part of the string. This is fine as long as we do not write more + // than we consumed. + if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/json_scanner.h b/contrib/libs/simdjson/src/generic/stage1/json_scanner.h new file mode 100644 index 000000000000..9ef41fb349ee --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/json_scanner.h @@ -0,0 +1,168 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). + * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. 
+ * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. 
not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. 
+ */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. 
+// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. 
+ characters, + follows_nonquote_scalar + ); +} + +simdjson_inline error_code json_scanner::finish() { + return string_scanner.finish(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/json_string_scanner.h b/contrib/libs/simdjson/src/generic/stage1/json_string_scanner.h new file mode 100644 index 000000000000..fb71b99a2c4f --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/json_string_scanner.h @@ -0,0 +1,99 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) : + _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_really_inline uint64_t escaped() const { return _escaped; } + // Real (non-backslashed) quotes + simdjson_really_inline uint64_t quote() const { return _quote; } + // Only characters inside the string (not including the quotes) + simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_really_inline uint64_t 
non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-escaped ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_really_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_really_inline error_code finish(); + +private: + // Scans for escape characters + json_escape_scanner escape_scanner{}; + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; +}; + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = escape_scanner.next(backslash).escaped; + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). 
+ // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_string_block(escaped, quote, in_string); +} + +simdjson_really_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/json_structural_indexer.h b/contrib/libs/simdjson/src/generic/stage1/json_structural_indexer.h new file mode 100644 index 000000000000..d9370b0c66c7 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/json_structural_indexer.h @@ -0,0 +1,358 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H +#include +#include +#include +#include +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + +#if SIMDJSON_PREFER_REVERSE_BITS + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + 
* bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } +#else + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } +#endif // SIMDJSON_PREFER_REVERSE_BITS + + template + simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) { + write_index(idx, bits, START); + SIMDJSON_IF_CONSTEXPR (N > 1) { + write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits); + } + return START+N; + } + + template + simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) { + write_indexes(idx, bits); + SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) { + if (simdjson_unlikely((START+STEP) < cnt)) { + write_indexes_stepped<(START+STEP(idx, bits, cnt); + } + } + return ((END-START) % STEP) == 0 ? 
END : (END-START) - ((END-START) % STEP) + STEP; + } + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it + // will provide its own version of the code. +#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; + + int cnt = static_cast(count_ones(bits)); + +#if SIMDJSON_PREFER_REVERSE_BITS + bits = reverse_bits(bits); +#endif +#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP + static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP; +#else + static constexpr const int STEP = 4; +#endif + static constexpr const int STEP_UNTIL = 24; + + write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt); + SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) { + if (simdjson_unlikely(STEP_UNTIL < cnt)) { + for (int i=STEP_UNTIL; itail += cnt; + } +#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. 
+ */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. 
This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) 
+ uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); +#if SIMDJSON_UTF8VALIDATION + checker.check_next_input(in); +#endif + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On-Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. 
+ * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). 
+ // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to. 
+#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H diff --git a/contrib/libs/simdjson/src/generic/stage1/utf8_lookup4_algorithm.h b/contrib/libs/simdjson/src/generic/stage1/utf8_lookup4_algorithm.h new file mode 100644 index 000000000000..1a196159b888 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/utf8_lookup4_algorithm.h @@ -0,0 +1,209 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 
0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const 
simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = must_be_2_3_continuation(prev2, prev3); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? 
error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage1/utf8_validator.h b/contrib/libs/simdjson/src/generic/stage1/utf8_validator.h new file mode 100644 index 000000000000..ffc651ad91fe --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage1/utf8_validator.h @@ -0,0 +1,45 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/amalgamated.h b/contrib/libs/simdjson/src/generic/stage2/amalgamated.h new file mode 100644 index 000000000000..43e1d7c682c4 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/amalgamated.h @@ 
-0,0 +1,10 @@ +// Stuff other things depend on +#include +#include +#include + +// All other declarations +#include +#include +#include +#include diff --git a/contrib/libs/simdjson/src/generic/stage2/base.h b/contrib/libs/simdjson/src/generic/stage2/base.h new file mode 100644 index 000000000000..b2e987c40b8f --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/base.h @@ -0,0 +1,23 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage2 { + +class json_iterator; +class structural_iterator; +struct tape_builder; +struct tape_writer; + +} // namespace stage2 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/dependencies.h b/contrib/libs/simdjson/src/generic/stage2/dependencies.h new file mode 100644 index 000000000000..b5d502d2dee8 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/dependencies.h @@ -0,0 +1,7 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H +#define SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H + +#include +#include + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/json_iterator.h b/contrib/libs/simdjson/src/generic/stage2/json_iterator.h new file mode 100644 index 000000000000..810e8fc52526 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/json_iterator.h @@ -0,0 +1,328 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { 
+namespace { +namespace stage2 { + +class json_iterator { +public: + const uint8_t* const buf; + uint32_t *next_structural; + dom_parser_implementation &dom_parser; + uint32_t depth{0}; + + /** + * Walk the JSON document. + * + * The visitor receives callbacks when values are encountered. All callbacks pass the iterator as + * the first parameter; some callbacks have other parameters as well: + * + * - visit_document_start() - at the beginning. + * - visit_document_end() - at the end (if things were successful). + * + * - visit_array_start() - at the start `[` of a non-empty array. + * - visit_array_end() - at the end `]` of a non-empty array. + * - visit_empty_array() - when an empty array is encountered. + * + * - visit_object_end() - at the start `]` of a non-empty object. + * - visit_object_start() - at the end `]` of a non-empty object. + * - visit_empty_object() - when an empty object is encountered. + * - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is + * guaranteed to point at the first quote of the string (`"key"`). + * - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null. + * - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null. + * + * - increment_count(iter) - each time a value is found in an array or object. + */ + template + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). 
+ */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. 
+ */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( 
visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto 
array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + } + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline 
void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + // Use the fact that most scalars are going to be either strings or numbers. + if(*value == '"') { + return visitor.visit_string(*this, value); + } else if (((*value - '0') < 10) || (*value == '-')) { + return visitor.visit_number(*this, value); + } + // true, false, null are uncommon. 
+ switch (*value) { + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // namespace stage2 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/logger.h b/contrib/libs/simdjson/src/generic/stage2/logger.h new file mode 100644 index 000000000000..60955495e8e1 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/logger.h @@ -0,0 +1,100 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + + +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. 
+ + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. 
+ for (int i=0;i +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. + 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. 
+ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr, bool allow_replacement) { + // Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) + constexpr uint32_t substitution_code_point = 0xfffd; + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion is not valid; we defer the check for this to inside the + // multilingual plane check. + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + if(!allow_replacement) { return false; } + code_point = substitution_code_point; + } else { + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. 
+ // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + if(!allow_replacement) { return false; } + code_point = substitution_code_point; + } else { + code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } + + } + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + if(!allow_replacement) { return false; } + code_point = substitution_code_point; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + + +// handle a unicode codepoint using the wobbly convention +// https://simonsapin.github.io/wtf-8/ +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // It is not ideal that this function is nearly identical to handle_unicode_codepoint. + // + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion is not valid; we defer the check for this to inside the + // multilingual plane check. + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. 
+ if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) == ((static_cast ('\\') << 8) | static_cast ('u'))) { + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + uint32_t low_bit = code_point_2 - 0xdc00; + if ((low_bit >> 10) == 0) { + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } + } + } + + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_ptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. 
*/ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } +} + +simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) { + // It is not ideal that this function is nearly identical to parse_string. + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint_wobbly(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. 
I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/structural_iterator.h b/contrib/libs/simdjson/src/generic/stage2/structural_iterator.h new file mode 100644 index 000000000000..3f5ec4ff41d3 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/structural_iterator.h @@ -0,0 +1,64 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage2 { + +class structural_iterator { +public: + const uint8_t* const buf; + uint32_t *next_structural; + dom_parser_implementation &dom_parser; + + // Start a structural + simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { + } + // Get the buffer position of the current structural character + simdjson_inline const uint8_t* current() { + return &buf[*(next_structural-1)]; + } + // Get the current structural character + simdjson_inline char current_char() { + return buf[*(next_structural-1)]; + } + // Get the next structural character without advancing + 
simdjson_inline char peek_next_char() { + return buf[*next_structural]; + } + simdjson_inline const uint8_t* peek() { + return &buf[*next_structural]; + } + simdjson_inline const uint8_t* advance() { + return &buf[*(next_structural++)]; + } + simdjson_inline char advance_char() { + return buf[*(next_structural++)]; + } + simdjson_inline size_t remaining_len() { + return dom_parser.len - *(next_structural-1); + } + + simdjson_inline bool at_end() { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; + } + simdjson_inline bool at_beginning() { + return next_structural == dom_parser.structural_indexes.get(); + } +}; + +} // namespace stage2 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/tape_builder.h b/contrib/libs/simdjson/src/generic/stage2/tape_builder.h new file mode 100644 index 000000000000..52931010fc01 --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/tape_builder.h @@ -0,0 +1,297 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H +#include +#include +#include +#include +#include +#include +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. 
*/ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. 
*/ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // struct tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + 
tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? "key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst, false); // We do not allow replacement when the escape characters are invalid. + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. 
However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code 
tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. 
+} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H \ No newline at end of file diff --git a/contrib/libs/simdjson/src/generic/stage2/tape_writer.h b/contrib/libs/simdjson/src/generic/stage2/tape_writer.h new file mode 100644 index 000000000000..947aa6d0965c --- /dev/null +++ b/contrib/libs/simdjson/src/generic/stage2/tape_writer.h @@ -0,0 +1,117 @@ +#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H +#include +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. 
+ * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct tape_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. */ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H \ No newline at end 
of file diff --git a/contrib/libs/simdjson/src/haswell.cpp b/contrib/libs/simdjson/src/haswell.cpp new file mode 100644 index 000000000000..f721cac8b956 --- /dev/null +++ b/contrib/libs/simdjson/src/haswell.cpp @@ -0,0 +1,169 @@ +#ifndef SIMDJSON_SRC_HASWELL_CPP +#define SIMDJSON_SRC_HASWELL_CPP + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +#include +#include +#include +#include + +// +// Stage 1 +// + +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +namespace { + +using namespace simd; + +// This identifies structural characters (comma, colon, braces, brackets), +// and ASCII white-space ('\r','\n','\t',' '). +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + const auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. + // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. 
+ // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If later code only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). + + const uint64_t whitespace = in.eq({ + _mm256_shuffle_epi8(whitespace_table, in.chunks[0]), + _mm256_shuffle_epi8(whitespace_table, in.chunks[1]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20, + in.chunks[1] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm256_shuffle_epi8(op_table, in.chunks[0]), + _mm256_shuffle_epi8(op_table, in.chunks[1]) + }); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
+ return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 + return is_third_byte | is_fourth_byte; +} + +} // unnamed namespace +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +// +// Stage 2 +// + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace SIMDJSON_IMPLEMENTATION { + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept { + return haswell::stringparsing::parse_string(src, dst, replacement_char); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept { + return 
haswell::stringparsing::parse_wobbly_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace SIMDJSON_IMPLEMENTATION +} // namespace simdjson + +#include + +#endif // SIMDJSON_SRC_HASWELL_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/icelake.cpp b/contrib/libs/simdjson/src/icelake.cpp new file mode 100644 index 000000000000..8ec08c69ccf4 --- /dev/null +++ b/contrib/libs/simdjson/src/icelake.cpp @@ -0,0 +1,215 @@ +#ifndef SIMDJSON_SRC_ICELAKE_CPP +#define SIMDJSON_SRC_ICELAKE_CPP + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +// defining SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER allows us to provide our own bit_indexer::write +#define SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER + +#include +#include +#include +#include + +#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER + +// +// Stage 1 +// + +namespace simdjson { +namespace icelake { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +namespace { + +using namespace simd; + +// This identifies structural characters (comma, colon, braces, brackets), +// and ASCII white-space ('\r','\n','\t',' '). 
+simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + const auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. + // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. + // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If later code only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). 
+ + const uint64_t whitespace = in.eq({ + _mm512_shuffle_epi8(whitespace_table, in.chunks[0]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm512_shuffle_epi8(op_table, in.chunks[0]) + }); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 + return is_third_byte | is_fourth_byte; +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +/** + * We provide a custom version of bit_indexer::write using + * naked intrinsics. + * TODO: make this code more elegant. + */ +// Under GCC 12, the intrinsic _mm512_extracti32x4_epi32 may generate 'maybe uninitialized'. +// as a workaround, we disable warnings within the following function. +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS +namespace simdjson { namespace icelake { namespace { namespace stage1 { +simdjson_inline void bit_indexer::write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. 
+ // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) { return; } + + const __m512i indexes = _mm512_maskz_compress_epi8(bits, _mm512_set_epi32( + 0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130, + 0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120, + 0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110, + 0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100 + )); + const __m512i start_index = _mm512_set1_epi32(idx); + + const auto count = count_ones(bits); + __m512i t0 = _mm512_cvtepu8_epi32(_mm512_castsi512_si128(indexes)); + _mm512_storeu_si512(this->tail, _mm512_add_epi32(t0, start_index)); + + if(count > 16) { + const __m512i t1 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 1)); + _mm512_storeu_si512(this->tail + 16, _mm512_add_epi32(t1, start_index)); + if(count > 32) { + const __m512i t2 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 2)); + _mm512_storeu_si512(this->tail + 32, _mm512_add_epi32(t2, start_index)); + if(count > 48) { + const __m512i t3 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 3)); + _mm512_storeu_si512(this->tail + 48, _mm512_add_epi32(t3, start_index)); + } + } + } + this->tail += count; +} +}}}} +SIMDJSON_POP_DISABLE_WARNINGS + +// +// Stage 2 +// + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace icelake { + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return icelake::stage1::generic_validate_utf8(buf,len); +} + 
+simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept { + return icelake::stringparsing::parse_string(src, dst, replacement_char); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept { + return icelake::stringparsing::parse_wobbly_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace icelake +} // namespace simdjson + +#include + +#endif // SIMDJSON_SRC_ICELAKE_CPP diff --git a/contrib/libs/simdjson/src/implementation.cpp b/contrib/libs/simdjson/src/implementation.cpp new file mode 100644 index 000000000000..4323e76bfebf --- /dev/null +++ b/contrib/libs/simdjson/src/implementation.cpp @@ -0,0 +1,330 @@ +#ifndef SIMDJSON_SRC_IMPLEMENTATION_CPP +#define SIMDJSON_SRC_IMPLEMENTATION_CPP + +#include +#include +#include +#include + +#include +#include + +namespace simdjson { + +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets); +} + +} // namespace simdjson + +#define SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_IMPLEMENTATION_ARM64 +#include +namespace simdjson { +namespace internal { 
+static const arm64::implementation* get_arm64_singleton() { + static const arm64::implementation arm64_singleton{}; + return &arm64_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_ARM64 + +#if SIMDJSON_IMPLEMENTATION_FALLBACK +#include +namespace simdjson { +namespace internal { +static const fallback::implementation* get_fallback_singleton() { + static const fallback::implementation fallback_singleton{}; + return &fallback_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_FALLBACK + + +#if SIMDJSON_IMPLEMENTATION_HASWELL +#include +namespace simdjson { +namespace internal { +static const haswell::implementation* get_haswell_singleton() { + static const haswell::implementation haswell_singleton{}; + return &haswell_singleton; +} +} // namespace internal +} // namespace simdjson +#endif + +#if SIMDJSON_IMPLEMENTATION_ICELAKE +#include +namespace simdjson { +namespace internal { +static const icelake::implementation* get_icelake_singleton() { + static const icelake::implementation icelake_singleton{}; + return &icelake_singleton; +} +} // namespace internal +} // namespace simdjson +#endif + +#if SIMDJSON_IMPLEMENTATION_PPC64 +#error #include +namespace simdjson { +namespace internal { +static const ppc64::implementation* get_ppc64_singleton() { + static const ppc64::implementation ppc64_singleton{}; + return &ppc64_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_PPC64 + +#if SIMDJSON_IMPLEMENTATION_WESTMERE +#include +namespace simdjson { +namespace internal { +static const simdjson::westmere::implementation* get_westmere_singleton() { + static const simdjson::westmere::implementation westmere_singleton{}; + return &westmere_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_WESTMERE + +#if SIMDJSON_IMPLEMENTATION_LSX +#include +namespace simdjson { +namespace internal 
{ +static const simdjson::lsx::implementation* get_lsx_singleton() { + static const simdjson::lsx::implementation lsx_singleton{}; + return &lsx_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_LSX + +#if SIMDJSON_IMPLEMENTATION_LASX +#include +namespace simdjson { +namespace internal { +static const simdjson::lasx::implementation* get_lasx_singleton() { + static const simdjson::lasx::implementation lasx_singleton{}; + return &lasx_singleton; +} +} // namespace internal +} // namespace simdjson +#endif // SIMDJSON_IMPLEMENTATION_LASX + +#undef SIMDJSON_CONDITIONAL_INCLUDE + +namespace simdjson { +namespace internal { + +// When there is a single implementation, we should not pay a price +// for dispatching to the best implementation. We should just use the +// one we have. This is a compile-time check. +#define SIMDJSON_SINGLE_IMPLEMENTATION (SIMDJSON_IMPLEMENTATION_ICELAKE \ + + SIMDJSON_IMPLEMENTATION_HASWELL + SIMDJSON_IMPLEMENTATION_WESTMERE \ + + SIMDJSON_IMPLEMENTATION_ARM64 + SIMDJSON_IMPLEMENTATION_PPC64 \ + + SIMDJSON_IMPLEMENTATION_LSX + SIMDJSON_IMPLEMENTATION_LASX \ + + SIMDJSON_IMPLEMENTATION_FALLBACK == 1) + +#if SIMDJSON_SINGLE_IMPLEMENTATION + static const implementation* get_single_implementation() { + return +#if SIMDJSON_IMPLEMENTATION_ICELAKE + get_icelake_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL + get_haswell_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE + get_westmere_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_ARM64 + get_arm64_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_PPC64 + get_ppc64_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_LSX + get_lsx_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_LASX + get_lasx_singleton(); +#endif +#if SIMDJSON_IMPLEMENTATION_FALLBACK + get_fallback_singleton(); +#endif +} +#endif + +// Static array of known implementations. 
We're hoping these get baked into the executable +// without requiring a static initializer. + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final : public implementation { +public: + std::string name() const noexcept final { return set_best()->name(); } + std::string description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final { + return set_best()->create_dom_parser_implementation(capacity, max_length, dst); + } + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { + return set_best()->minify(buf, len, dst, dst_len); + } + simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } + simdjson_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} +private: + const implementation *set_best() const noexcept; +}; + +static_assert(std::is_trivially_destructible::value, "detect_best_supported_implementation_on_first_use should be trivially destructible"); + +static const std::initializer_list& get_available_implementation_pointers() { + static const std::initializer_list available_implementation_pointers { +#if SIMDJSON_IMPLEMENTATION_ICELAKE + get_icelake_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL + get_haswell_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE + get_westmere_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_ARM64 + get_arm64_singleton(), +#endif +#if 
SIMDJSON_IMPLEMENTATION_PPC64 + get_ppc64_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_LSX + get_lsx_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_LASX + get_lasx_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_FALLBACK + get_fallback_singleton(), +#endif + }; // available_implementation_pointers + return available_implementation_pointers; +} + +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support +class unsupported_implementation final : public implementation { +public: + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t, + size_t, + std::unique_ptr& + ) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } + simdjson_warn_unused error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override { + return UNSUPPORTED_ARCHITECTURE; + } + simdjson_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. 
+ } + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +static_assert(std::is_trivially_destructible::value, "unsupported_singleton should be trivially destructible"); + +const unsupported_implementation* get_unsupported_singleton() { + static const unsupported_implementation unsupported_singleton{}; + return &unsupported_singleton; +} + +size_t available_implementation_list::size() const noexcept { + return internal::get_available_implementation_pointers().size(); +} +const implementation * const *available_implementation_list::begin() const noexcept { + return internal::get_available_implementation_pointers().begin(); +} +const implementation * const *available_implementation_list::end() const noexcept { + return internal::get_available_implementation_pointers().end(); +} +const implementation *available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + for (const implementation *impl : internal::get_available_implementation_pointers()) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + } + return get_unsupported_singleton(); // this should never happen? 
+} + +const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = get_available_implementations()[force_implementation_name]; + if (force_implementation) { + return get_active_implementation() = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. + return get_active_implementation() = get_unsupported_singleton(); + } + } + return get_active_implementation() = get_available_implementations().detect_best_supported(); +} + +} // namespace internal + +SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() { + static const internal::available_implementation_list available_implementations{}; + return available_implementations; +} + +SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation() { +#if SIMDJSON_SINGLE_IMPLEMENTATION + // We immediately select the only implementation we have, skipping the + // detect_best_supported_implementation_on_first_use_singleton. 
+ static internal::atomic_ptr active_implementation{internal::get_single_implementation()}; + return active_implementation; +#else + static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{&detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +#endif +} + +simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept { + return get_active_implementation()->minify(reinterpret_cast(buf), len, reinterpret_cast(dst), dst_len); +} +simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return get_active_implementation()->validate_utf8(buf, len); +} +const implementation * builtin_implementation() { + static const implementation * builtin_impl = get_available_implementations()[SIMDJSON_STRINGIFY(SIMDJSON_BUILTIN_IMPLEMENTATION)]; + assert(builtin_impl); + return builtin_impl; +} + +} // namespace simdjson + +#endif // SIMDJSON_SRC_IMPLEMENTATION_CPP diff --git a/contrib/libs/simdjson/src/internal/error_tables.cpp b/contrib/libs/simdjson/src/internal/error_tables.cpp new file mode 100644 index 000000000000..43499bbab09f --- /dev/null +++ b/contrib/libs/simdjson/src/internal/error_tables.cpp @@ -0,0 +1,48 @@ +#ifndef SIMDJSON_SRC_ERROR_TABLES_CPP +#define SIMDJSON_SRC_ERROR_TABLES_CPP + +#include +#include + +namespace simdjson { +namespace internal { + + SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] { + { SUCCESS, "SUCCESS: No error" }, + { CAPACITY, "CAPACITY: This parser can't support a document that big" }, + { MEMALLOC, "MEMALLOC: Error allocating memory, we're most likely out of memory" }, + { TAPE_ERROR, "TAPE_ERROR: The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." 
}, + { DEPTH_ERROR, "DEPTH_ERROR: The JSON document was too deep (too many nested objects and arrays)" }, + { STRING_ERROR, "STRING_ERROR: Problem while parsing a string" }, + { T_ATOM_ERROR, "T_ATOM_ERROR: Problem while parsing an atom starting with the letter 't'" }, + { F_ATOM_ERROR, "F_ATOM_ERROR: Problem while parsing an atom starting with the letter 'f'" }, + { N_ATOM_ERROR, "N_ATOM_ERROR: Problem while parsing an atom starting with the letter 'n'" }, + { NUMBER_ERROR, "NUMBER_ERROR: Problem while parsing a number" }, + { BIGINT_ERROR, "BIGINT_ERROR: Big integer value that cannot be represented using 64 bits" }, + { UTF8_ERROR, "UTF8_ERROR: The input is not valid UTF-8" }, + { UNINITIALIZED, "UNINITIALIZED: Uninitialized" }, + { EMPTY, "EMPTY: no JSON found" }, + { UNESCAPED_CHARS, "UNESCAPED_CHARS: Within strings, some characters must be escaped, we found unescaped characters" }, + { UNCLOSED_STRING, "UNCLOSED_STRING: A string is opened, but never closed." }, + { UNSUPPORTED_ARCHITECTURE, "UNSUPPORTED_ARCHITECTURE: simdjson does not have an implementation supported by this CPU architecture. Please report this error to the core team as it should never happen." }, + { INCORRECT_TYPE, "INCORRECT_TYPE: The JSON element does not have the requested type." }, + { NUMBER_OUT_OF_RANGE, "NUMBER_OUT_OF_RANGE: The JSON number is too large or too small to fit within the requested type." }, + { INDEX_OUT_OF_BOUNDS, "INDEX_OUT_OF_BOUNDS: Attempted to access an element of a JSON array that is beyond its length." }, + { NO_SUCH_FIELD, "NO_SUCH_FIELD: The JSON field referenced does not exist in this object." }, + { IO_ERROR, "IO_ERROR: Error reading the file." }, + { INVALID_JSON_POINTER, "INVALID_JSON_POINTER: Invalid JSON pointer syntax." }, + { INVALID_URI_FRAGMENT, "INVALID_URI_FRAGMENT: Invalid URI fragment syntax." 
}, + { UNEXPECTED_ERROR, "UNEXPECTED_ERROR: Unexpected error, consider reporting this problem as you may have found a bug in simdjson" }, + { PARSER_IN_USE, "PARSER_IN_USE: Cannot parse a new document while a document is still in use." }, + { OUT_OF_ORDER_ITERATION, "OUT_OF_ORDER_ITERATION: Objects and arrays can only be iterated when they are first encountered." }, + { INSUFFICIENT_PADDING, "INSUFFICIENT_PADDING: simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length. Consider using the simdjson::padded_string class if needed." }, + { INCOMPLETE_ARRAY_OR_OBJECT, "INCOMPLETE_ARRAY_OR_OBJECT: JSON document ended early in the middle of an object or array." }, + { SCALAR_DOCUMENT_AS_VALUE, "SCALAR_DOCUMENT_AS_VALUE: A JSON document made of a scalar (number, Boolean, null or string) is treated as a value. Use get_bool(), get_double(), etc. on the document instead. "}, + { OUT_OF_BOUNDS, "OUT_OF_BOUNDS: Attempt to access location outside of document."}, + { TRAILING_CONTENT, "TRAILING_CONTENT: Unexpected trailing content in the JSON input."} + }; // error_messages[] + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_SRC_ERROR_TABLES_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/internal/isadetection.h b/contrib/libs/simdjson/src/internal/isadetection.h new file mode 100644 index 000000000000..c873f7b74ecb --- /dev/null +++ b/contrib/libs/simdjson/src/internal/isadetection.h @@ -0,0 +1,247 @@ +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. 
+ +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SIMDJSON_INTERNAL_ISADETECTION_H +#define SIMDJSON_INTERNAL_ISADETECTION_H + +#include "simdjson/internal/instruction_set.h" + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +namespace simdjson { +namespace internal { + +#if defined(__PPC64__) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::ALTIVEC; +} + +#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::NEON; +} + +#elif defined(__x86_64__) || defined(_M_AMD64) // x64 + + +namespace { +// Can be found on Intel ISA Reference for CPUID +constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7 +constexpr uint32_t 
cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7 +constexpr uint64_t cpuid_avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX +constexpr uint64_t cpuid_avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM +constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 +constexpr uint32_t cpuid_osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1 +constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 +} + + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuidex(cpu_info, *eax, *ecx); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + + +static inline uint64_t xgetbv() { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t xcr0_lo, xcr0_hi; + asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); + return xcr0_lo | (uint64_t(xcr0_hi) << 32); +#endif +} + +static inline uint32_t detect_supported_architectures() { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; + + // EBX for EAX=0x1 + eax = 0x1; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_sse42_bit) { + host_isa |= instruction_set::SSE42; + } else { + return host_isa; // everything after is redundant + } + + if 
(ecx & cpuid_pclmulqdq_bit) { + host_isa |= instruction_set::PCLMULQDQ; + } + + + if ((ecx & cpuid_osxsave) != cpuid_osxsave) { + return host_isa; + } + + // xgetbv for checking if the OS saves registers + uint64_t xcr0 = xgetbv(); + + if ((xcr0 & cpuid_avx256_saved) == 0) { + return host_isa; + } + + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) { + host_isa |= instruction_set::AVX2; + } + if (ebx & cpuid_bmi1_bit) { + host_isa |= instruction_set::BMI1; + } + + if (ebx & cpuid_bmi2_bit) { + host_isa |= instruction_set::BMI2; + } + + if (!((xcr0 & cpuid_avx512_saved) == cpuid_avx512_saved)) { + return host_isa; + } + + if (ebx & cpuid_avx512f_bit) { + host_isa |= instruction_set::AVX512F; + } + + if (ebx & cpuid_avx512dq_bit) { + host_isa |= instruction_set::AVX512DQ; + } + + if (ebx & cpuid_avx512ifma_bit) { + host_isa |= instruction_set::AVX512IFMA; + } + + if (ebx & cpuid_avx512pf_bit) { + host_isa |= instruction_set::AVX512PF; + } + + if (ebx & cpuid_avx512er_bit) { + host_isa |= instruction_set::AVX512ER; + } + + if (ebx & cpuid_avx512cd_bit) { + host_isa |= instruction_set::AVX512CD; + } + + if (ebx & cpuid_avx512bw_bit) { + host_isa |= instruction_set::AVX512BW; + } + + if (ebx & cpuid_avx512vl_bit) { + host_isa |= instruction_set::AVX512VL; + } + + if (ecx & cpuid_avx512vbmi2_bit) { + host_isa |= instruction_set::AVX512VBMI2; + } + + return host_isa; +} + +#elif defined(__loongarch_sx) && !defined(__loongarch_asx) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::LSX; +} + +#elif defined(__loongarch_asx) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::LASX; +} + +#else // fallback + + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + + +#endif // end SIMD extension detection code + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_ISADETECTION_H 
diff --git a/contrib/libs/simdjson/src/internal/jsoncharutils_tables.cpp b/contrib/libs/simdjson/src/internal/jsoncharutils_tables.cpp new file mode 100644 index 000000000000..e16dbf355807 --- /dev/null +++ b/contrib/libs/simdjson/src/internal/jsoncharutils_tables.cpp @@ -0,0 +1,197 @@ +#ifndef SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP +#define SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP + +#include + +namespace simdjson { +namespace internal { + +// structural chars here are +// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) +// we are also interested in the four whitespace characters +// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + +SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886] = { + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, + 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, + 0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, + 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, + 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, + 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, + 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x100, 0x200, 
0x300, 0x400, 0x500, + 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, + 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, + 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, + 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, + 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, + 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/internal/numberparsing_tables.cpp b/contrib/libs/simdjson/src/internal/numberparsing_tables.cpp new file mode 100644 index 000000000000..74d97918cd83 --- /dev/null +++ b/contrib/libs/simdjson/src/internal/numberparsing_tables.cpp @@ -0,0 +1,681 @@ +#ifndef SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP +#define SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP + +#include +#include + +// Precomputed powers of ten from 10^0 to 10^22. These +// can be represented exactly using the double type. 
+SIMDJSON_DLLIMPORTEXPORT const double simdjson::internal::power_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + + +// The truncated powers of five from 5^-342 all the way to 5^308 +// The mantissa is truncated to 128 bits, and +// never rounded up. Uses about 10KB. +SIMDJSON_DLLIMPORTEXPORT const uint64_t simdjson::internal::power_of_five_128[]= { + 0xeef453d6923bd65a,0x113faa2906a13b3f, + 0x9558b4661b6565f8,0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, + 0xe95a99df8ace6f53,0xf4d82c2c107973dc, + 0x91d8a02bb6c10594,0x79071b9b8a4be869, + 0xb64ec836a47146f9,0x9748e2826cdee284, + 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f,0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723,0xad2c788035e61382, + 0x8b16fb203055ac76,0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78,0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b,0x8672648c40e5ad68, + 0xa9c98d8ccb009506,0x680efdaf511f18c2, + 0xd43bf0effdc0ba48,0x212bd1b2566def2, + 0x84a57695fe98746d,0x14bb630f7604b57, + 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, + 0xcf42894a5dce35ea,0x52064cac828675b9, + 0x818995ce7aa0e1b2,0x7343efebd1940993, + 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6,0xd41a26e077774ef6, + 0xfd00b897478238d0,0x8920b098955522b4, + 0x9e20735e8cb16382,0x55b46e5f5d5535b0, + 0xc5a890362fddbc62,0xeb2189f734aa831d, + 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d,0x47b233c92125366e, + 0xc1069cd4eabe89f8,0x999ec0bb696e840a, + 
0xf148440a256e2c76,0xc00670ea43ca250d, + 0x96cd2a865764dbca,0x380406926a5e5728, + 0xbc807527ed3e12bc,0xc605083704f5ecf2, + 0xeba09271e88d976b,0xf7864a44c633682e, + 0x93445b8731587ea3,0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c,0x5960ea05bad82964, + 0xe61acf033d1a45df,0x6fb92487298e33bd, + 0x8fd0c16206306bab,0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696,0x8f48a4899877186c, + 0xe0b62e2929aba83c,0x331acdabfe94de87, + 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, + 0x892731ac9faf056e,0xbe311c083a225cd2, + 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, + 0xd64d3d9db981787d,0x92cbbccdad5b108, + 0x85f0468293f0eb4e,0x25bbf56008c58ea5, + 0xa76c582338ed2621,0xaf2af2b80af6f24e, + 0xd1476e2c07286faa,0x1af5af660db4aee1, + 0x82cca4db847945ca,0x50d98d9fc890ed4d, + 0xa37fce126597973c,0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1,0x77b191618c54e9ac, + 0xc795830d75038c1d,0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, + 0x9becce62836ac577,0x4ee367f9430aec32, + 0xc2e801fb244576d5,0x229c41f793cda73f, + 0xf3a20279ed56d48a,0x6b43527578c1110f, + 0x9845418c345644d6,0x830a13896b78aaa9, + 0xbe5691ef416bd60c,0x23cc986bc656d553, + 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, + 0x91376c36d99995be,0x23100809b9c21fa1, + 0xb58547448ffffb2d,0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9,0x16c90c8f323f516c, + 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, + 0xb1442798f49ffb4a,0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d,0x40405643d711d583, + 0x8a7d3eef7f1cfc52,0x482835ea666b2572, + 0xad1c8eab5ee43b66,0xda3243650005eecf, + 0xd863b256369d4a40,0x90bed43e40076a82, + 0x873e4f75e2224e68,0x5a7744a6e804a291, + 0xa90de3535aaae202,0x711515d0a205cb36, + 0xd3515c2831559a83,0xd5a5b44ca873e03, + 0x8412d9991ed58091,0xe858790afe9486c2, + 
0xa5178fff668ae0b6,0x626e974dbe39a872, + 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, + 0xa139029f6a239f72,0x1c1fffc1ebc44e80, + 0xc987434744ac874e,0xa327ffb266b56220, + 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, + 0xc4ce17b399107c22,0xcb550fb4384d21d3, + 0xf6019da07f549b2b,0x7e2a53a146606a48, + 0x99c102844f94e0fb,0x2eda7444cbfc426d, + 0xc0314325637a1939,0xfa911155fefb5308, + 0xf03d93eebc589f88,0x793555ab7eba27ca, + 0x96267c7535b763b5,0x4bc1558b2f3458de, + 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb,0x465e15a979c1cadc, + 0x92a1958a7675175f,0xbfacd89ec191ec9, + 0xb749faed14125d36,0xcef980ec671f667b, + 0xe51c79a85916f484,0x82b7e12780e7401a, + 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9,0x67a791e093e1d49a, + 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d,0x58fae9f773886e18, + 0xda7f5bf590966848,0xaf39a475506a899e, + 0x888f99797a5e012d,0x6d8406c952429603, + 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26,0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, + 0xd0601d8efc57b08b,0xf13b94daf124da26, + 0x823c12795db6ce57,0x76c53d08d6b70858, + 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02,0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a,0x359ab6419ca1091b, + 0xf867241c8cc6d4c0,0xc30163d203c94b62, + 0x9b407691d7fc44f8,0x79e0de63425dcf1d, + 0xc21094364dfb5636,0x985915fc12f542e4, + 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, + 0xbd8430bd08277231,0x50c6ff782a838353, + 0xece53cec4a314ebd,0xa4f8bf5635246428, + 0x940f4613ae5ed136,0x871b7795e136be99, + 0xb913179899f68584,0x28e2557b59846e3f, + 0xe757dd7ec07426e5,0x331aeada2fe589cf, + 0x9096ea6f3848984f,0x3ff0d2c85def7621, + 0xb4bca50b065abe63,0xfed077a756b53a9, + 
0xe1ebce4dc7f16dfb,0xd3e8495912c62894, + 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, + 0xb080392cc4349dec,0xbd8d794d96aacfb3, + 0xdca04777f541c567,0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60,0xf41686c49db57244, + 0xac5d37d5b79b6239,0x311c2875c522ced5, + 0xd77485cb25823ac7,0x7d633293366b828b, + 0x86a8d39ef77164bc,0xae5dff9c02033197, + 0xa8530886b54dbdeb,0xd9f57f830283fdfc, + 0xd267caa862a12d66,0xd072df63c324fd7b, + 0x8380dea93da4bc60,0x4247cb9e59f71e6d, + 0xa46116538d0deb78,0x52d9be85f074e608, + 0xcd795be870516656,0x67902e276c921f8b, + 0x806bd9714632dff6,0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c,0x796b805720085f81, + 0x9cc3a6eec6311a63,0xcbe3303674053bb0, + 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b,0xee92fb5515482d44, + 0x991711052d8bf3c5,0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6,0xd262d45a78a0635d, + 0xef340a98172aace4,0x86fb897116c87c34, + 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, + 0xbae0a846d2195712,0x8974836059cca109, + 0xe998d258869facd7,0x2bd1a438703fc94b, + 0x91ff83775423cc06,0x7b6306a34627ddcf, + 0xb67f6455292cbf08,0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, + 0x8e938662882af53e,0x547eb47b7282ee9c, + 0xb23867fb2a35b28d,0xe99e619a4f23aa43, + 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, + 0xae0b158b4738705e,0x9624ab50b148d445, + 0xd98ddaee19068c76,0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b,0x7647c3200069671f, + 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, + 0xa5fb0a17c777cf09,0xf468107100525890, + 0xcf79cc9db955c2cc,0x7182148d4066eeb4, + 0x81ac1fe293d599bf,0xc6f14cd848405530, + 0xa21727db38cb002f,0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, + 0xfd442e4688bd304a,0x908f4a166d1da663, + 0x9e4a9cec15763e2e,0x9a598e4e043287fe, + 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, + 0xf7549530e188c128,0xd12bee59e68ef47c, + 
0x9a94dd3e8cf578b9,0x82bb74f8301958ce, + 0xc13a148e3032d6e7,0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, + 0xebdf661791d60f56,0x111b495b3464ad21, + 0x936b9fcebb25c995,0xcab10dd900beec34, + 0xb84687c269ef3bfb,0x3d5d514f40eea742, + 0xe65829b3046b0afa,0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, + 0xb3f4e093db73a093,0x59ed216765690f56, + 0xe0f218b8d25088b8,0x306869c13ec3532c, + 0x8c974f7383725573,0x1e414218c73a13fb, + 0xafbd2350644eeacf,0xe5d1929ef90898fa, + 0xdbac6c247d62a583,0xdf45f746b74abf39, + 0x894bc396ce5da772,0x6b8bba8c328eb783, + 0xab9eb47c81f5114f,0x66ea92f3f326564, + 0xd686619ba27255a2,0xc80a537b0efefebd, + 0x8613fd0145877585,0xbd06742ce95f5f36, + 0xa798fc4196e952e7,0x2c48113823b73704, + 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, + 0x82ef85133de648c4,0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3,0x318df905079926a8, + 0xffbbcfe994e5c61f,0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d,0x6bea10ca65c084e, + 0xc31bfa0fe5698db8,0x486e494fcff30a62, + 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7,0xf89629465a75e01c, + 0xbe89523386091465,0xf6bbb397f1135823, + 0xee2ba6c0678b597f,0x746aa07ded582e2c, + 0x94db483840b717ef,0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb,0x92f34d62616ce413, + 0xe896a0d7e51e1566,0x77b020baf9c81d17, + 0x915e2486ef32cd60,0xace1474dc1d122e, + 0xb5b5ada8aaff80b8,0xd819992132456ba, + 0xe3231912d5bf60e6,0x10e1fff697ed6c69, + 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d,0x86c16c98d2c953c6, + 0xd89d64d57a607744,0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b,0x11471cd764ad4972, + 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, + 
0xd389b47879823479,0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb,0xcedf722a585139ba, + 0xa54394fe1eedb8fe,0xc2974eb4ee658828, + 0xce947a3da6a9273e,0x733d226229feea32, + 0x811ccc668829b887,0x806357d5a3f525f, + 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052,0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67,0xbbac2078d443ace2, + 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, + 0xc5029163f384a931,0xa9e795e65d4df11, + 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e,0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, + 0xf07da27a82c37088,0x5d767327bb4e5a4c, + 0x964e858c91ba2655,0x3a6a07f8d510f86f, + 0xbbe226efb628afea,0x890489f70a55368b, + 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb,0x9ce6ebb40173744, + 0xe55990879ddcaabd,0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6,0x9fa946824a12232d, + 0xb32df8e9f3546564,0x47939822dc96abf9, + 0xdff9772470297ebd,0x59787e2b93bc56f7, + 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, + 0xaefae51477a06b03,0xede622920b6b23f1, + 0xdab99e59958885c4,0xe95fab368e45eced, + 0x88b402f7fd75539b,0x11dbcb0218ebb414, + 0xaae103b5fcd2a881,0xd652bdc29f26a119, + 0xd59944a37c0752a2,0x4be76d3346f0495f, + 0x857fcae62d8493a5,0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2,0x7e2000a41346a7a7, + 0x825ecc24c873782f,0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b,0x728900802f0f32fa, + 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc,0xe2f610c84987bfa8, + 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143,0x91503d1c79720dbb, + 0xf8a95fcf88747d94,0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, + 0xc24452da229b021b,0xfbe85badce996168, + 0xf2d56790ab41c2a2,0xfae27299423fb9c3, + 0x97c560ba6b0919a5,0xdccd879fc967d41a, + 0xbdb6b8e905cb600f,0x5400e987bbc1c920, + 0xed246723473e3813,0x290123e9aab23b68, + 0x9436c0760c86e30b,0xf9a0b6720aaf6521, + 0xb94470938fa89bce,0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2,0xb60b1d1230b20e04, + 
0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232,0x25c6da63c38de1b0, + 0x8d590723948a535f,0x579c487e5a38ad0e, + 0xb0af48ec79ace837,0x2d835a9df0c6d851, + 0xdcdb1b2798182244,0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5,0xe272467e3d222f3f, + 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea,0x98e947129fc2b4e9, + 0xa87fea27a539e9a5,0x3f2398d747b36224, + 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89,0x1953cf68300424ac, + 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, + 0xcdb02555653131b6,0x3792f412cb06794d, + 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b,0xf245825a5a445275, + 0xfb158592be068d2e,0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d,0x55464dd69685606b, + 0xc428d05aa4751e4c,0xaa97e14c3c26b886, + 0xf53304714d9265df,0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab,0xe546a8038efe4029, + 0xbf8fdb78849a5f96,0xde98520472bdd033, + 0xef73d256a5c0f77c,0x963e66858f6d4440, + 0x95a8637627989aad,0xdde7001379a44aa8, + 0xbb127c53b17ec159,0x5560c018580d5d52, + 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, + 0x9226712162ab070d,0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, + 0xb267ed1940f1c61c,0x55f038b237591ed3, + 0xdf01e85f912e37a3,0x6b6c46dec52f6688, + 0x8b61313bbabce2c6,0x2323ac4b3b3da015, + 0xae397d8aa96c1b77,0xabec975e0a0d081a, + 0xd9c7dced53c72255,0x96e7bd358c904a21, + 0x881cea14545c7575,0x7e50d64177da2e54, + 0xaa242499697392d2,0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, + 0x84ec3c97da624ab4,0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba,0x67de18eda5814af2, + 0x81ceb32c4b43fcf4,0x80eacf948770ced7, + 0xa2425ff75e14fc31,0xa1258379a94d028d, + 0xcad2f7f5359a3b3e,0x96ee45813a04330, + 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, + 0x9e74d1b791e07e48,0x775ea264cf55347e, + 
0xc612062576589dda,0x95364afe032a81a0, + 0xf79687aed3eec551,0x3a83ddbd83f52210, + 0x9abe14cd44753b52,0xc4926a9672793580, + 0xc16d9a0095928a27,0x75b7053c0f178400, + 0xf1c90080baf72cb1,0x5324c68b12dd6800, + 0x971da05074da7bee,0xd3f6fc16ebca8000, + 0xbce5086492111aea,0x88f4bb1ca6bd0000, + 0xec1e4a7db69561a5,0x2b31e9e3d0700000, + 0x9392ee8e921d5d07,0x3aff322e62600000, + 0xb877aa3236a4b449,0x9befeb9fad487c3, + 0xe69594bec44de15b,0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9,0xf9d37014bf60a11, + 0xb424dc35095cd80f,0x538484c19ef38c95, + 0xe12e13424bb40e13,0x2865a5f206b06fba, + 0x8cbccc096f5088cb,0xf93f87b7442e45d4, + 0xafebff0bcb24aafe,0xf78f69a51539d749, + 0xdbe6fecebdedd5be,0xb573440e5a884d1c, + 0x89705f4136b4a597,0x31680a88f8953031, + 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc,0x3d32907604691b4d, + 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, + 0xa7c5ac471b478423,0xfcf80dc33721d54, + 0xd1b71758e219652b,0xd3c36113404ea4a9, + 0x83126e978d4fdf3b,0x645a1cac083126ea, + 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, + 0xcccccccccccccccc,0xcccccccccccccccd, + 0x8000000000000000,0x0, + 0xa000000000000000,0x0, + 0xc800000000000000,0x0, + 0xfa00000000000000,0x0, + 0x9c40000000000000,0x0, + 0xc350000000000000,0x0, + 0xf424000000000000,0x0, + 0x9896800000000000,0x0, + 0xbebc200000000000,0x0, + 0xee6b280000000000,0x0, + 0x9502f90000000000,0x0, + 0xba43b74000000000,0x0, + 0xe8d4a51000000000,0x0, + 0x9184e72a00000000,0x0, + 0xb5e620f480000000,0x0, + 0xe35fa931a0000000,0x0, + 0x8e1bc9bf04000000,0x0, + 0xb1a2bc2ec5000000,0x0, + 0xde0b6b3a76400000,0x0, + 0x8ac7230489e80000,0x0, + 0xad78ebc5ac620000,0x0, + 0xd8d726b7177a8000,0x0, + 0x878678326eac9000,0x0, + 0xa968163f0a57b400,0x0, + 0xd3c21bcecceda100,0x0, + 0x84595161401484a0,0x0, + 0xa56fa5b99019a5c8,0x0, + 0xcecb8f27f4200f3a,0x0, + 0x813f3978f8940984,0x4000000000000000, + 0xa18f07d736b90be5,0x5000000000000000, + 0xc9f2c9cd04674ede,0xa400000000000000, + 0xfc6f7c4045812296,0x4d00000000000000, + 0x9dc5ada82b70b59d,0xf020000000000000, + 
0xc5371912364ce305,0x6c28000000000000, + 0xf684df56c3e01bc6,0xc732000000000000, + 0x9a130b963a6c115c,0x3c7f400000000000, + 0xc097ce7bc90715b3,0x4b9f100000000000, + 0xf0bdc21abb48db20,0x1e86d40000000000, + 0x96769950b50d88f4,0x1314448000000000, + 0xbc143fa4e250eb31,0x17d955a000000000, + 0xeb194f8e1ae525fd,0x5dcfab0800000000, + 0x92efd1b8d0cf37be,0x5aa1cae500000000, + 0xb7abc627050305ad,0xf14a3d9e40000000, + 0xe596b7b0c643c719,0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f,0xe4820023a2000000, + 0xb35dbf821ae4f38b,0xdda2802c8a800000, + 0xe0352f62a19e306e,0xd50b2037ad200000, + 0x8c213d9da502de45,0x4526f422cc340000, + 0xaf298d050e4395d6,0x9670b12b7f410000, + 0xdaf3f04651d47b4c,0x3c0cdd765f114000, + 0x88d8762bf324cd0f,0xa5880a69fb6ac800, + 0xab0e93b6efee0053,0x8eea0d047a457a00, + 0xd5d238a4abe98068,0x72a4904598d6d880, + 0x85a36366eb71f041,0x47a6da2b7f864750, + 0xa70c3c40a64e6c51,0x999090b65f67d924, + 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, + 0x82818f1281ed449f,0xbff8f10e7a8921a4, + 0xa321f2d7226895c7,0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, + 0xfee50b7025c36a08,0x2f236d04753d5b4, + 0x9f4f2726179a2245,0x1d762422c946590, + 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, + 0x9b934c3b330c8577,0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a,0x8bef464e3945ef7a, + 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, + 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436,0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44,0x60dbbca87196b616, + 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, + 0xb51d13aea4a488dd,0x6babab6398bdbe41, + 0xe264589a4dcdab14,0xc696963c7eed2dd1, + 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8,0x3b25a55f43294bcb, + 0xdd15fe86affad912,0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab,0x6e3569326c784337, + 0xacb92ed9397bf996,0x49c2c37f07965404, + 0xd7e77a8f87daf7fb,0xdc33745ec97be906, + 
0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, + 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b,0xf50a3fa490c30190, + 0x83c7088e1aab65db,0x792667c6da79e0fa, + 0xa4b8cab1a1563f52,0x577001b891185938, + 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, + 0x80b05e5ac60b6178,0x544f8158315b05b4, + 0xa0dc75f1778e39d6,0x696361ae3db1c721, + 0xc913936dd571c84c,0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f,0x4ab48a04065c723, + 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, + 0xc45d1df942711d9a,0x3ba5d0bd324f8394, + 0xf5746577930d6500,0xca8f44ec7ee36479, + 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5,0xbba1f1d158724a12, + 0xbb445da9ca61281f,0x2a8a6e45ae8edc97, + 0xea1575143cf97226,0xf52d09d71a3293bd, + 0x924d692ca61be758,0x593c2626705f9c56, + 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, + 0xe498f455c38b997a,0xb6dfb9c0f956447, + 0x8edf98b59a373fec,0x4724bd4189bd5eac, + 0xb2977ee300c50fe7,0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, + 0x8b865b215899f46c,0xbd79e0d20082ee74, + 0xae67f1e9aec07187,0xecd8590680a3aa11, + 0xda01ee641a708de9,0xe80e6f4820cc9495, + 0x884134fe908658b2,0x3109058d147fdcdd, + 0xaa51823e34a7eede,0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, + 0x850fadc09923329e,0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45,0x84db8346b786151c, + 0xcfe87f7cef46ff16,0xe612641865679a63, + 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749,0xe3be5e330f38f09d, + 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, + 0xc646d63501a1511d,0xb281e1fd541501b8, + 0xf7d88bc24209a565,0x1f225a7ca91a4226, + 0x9ae757596946075f,0x3375788de9b06958, + 0xc1a12d2fc3978937,0x52d6b1641c83ae, + 0xf209787bb47d6b84,0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332,0xf840b7ba963646e0, + 0xbd176620a501fbff,0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf,0xc66f336c36b10137, + 
0xb8a8d9bbe123f017,0xb80b0047445d4184, + 0xe6d3102ad96cec1d,0xa60dc059157491e5, + 0x9043ea1ac7e41392,0x87c89837ad68db2f, + 0xb454e4a179dd1877,0x29babe4598c311fb, + 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d,0x1899e4a65f58660c, + 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d,0x76707543f4fa1f73, + 0x899504ae72497eba,0x6a06494a791c53a8, + 0xabfa45da0edbde69,0x487db9d17636892, + 0xd6f8d7509292d603,0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, + 0xa7f26836f282b732,0x8e6cac7768d7141e, + 0xd1ef0244af2364ff,0x3207d795430cd926, + 0x8335616aed761f1f,0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, + 0xcd036837130890a1,0x36dba887c37a8c0f, + 0x802221226be55a64,0xc2494954da2c9789, + 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d,0x6f92829494e5acc7, + 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, + 0x9c69a97284b578d7,0xff2a760414536efb, + 0xc38413cf25e2d70d,0xfef5138519684aba, + 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, + 0x98bf2f79d5993802,0xef2f773ffbd97a61, + 0xbeeefb584aff8603,0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2,0xdd945a747bf26183, + 0xba756174393d88df,0x94f971119aeef9e4, + 0xe912b9d1478ceb17,0x7a37cd5601aab85d, + 0x91abb422ccb812ee,0xac62e055c10ab33a, + 0xb616a12b7fe617aa,0x577b986b314d6009, + 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d,0x14588f13be847307, + 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee,0x25de7bb9480d5854, + 0xada72ccc20054ae9,0xaf561aa79a10ae6a, + 0xd910f7ff28069da4,0x1b2ba1518094da04, + 0x87aa9aff79042286,0x90fb44d2f05d0842, + 0xa99541bf57452b28,0x353a1607ac744a53, + 0xd3fa922f2d1675f2,0x42889b8997915ce8, + 0x847c9b5d7c2e09b7,0x69956135febada11, + 0xa59bc234db398c25,0x43fab9837e699095, + 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, + 0x8161afb94b44f57d,0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc,0x6462d92a69731732, + 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, + 
0xfcb2cb35e702af78,0x5cda735244c3d43e, + 0x9defbf01b061adab,0x3a0888136afa64a7, + 0xc56baec21c7a1916,0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b,0x8aad549e57273d45, + 0x9a3c2087a63f6399,0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, + 0x969eb7c47859e743,0x9f644ae5a4b1b325, + 0xbc4665b596706114,0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8,0x9a7f12442d588f2, + 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, + 0x8fa475791a569d10,0xf96e017d694487bc, + 0xb38d92d760ec4455,0x37c981dcc395a9ac, + 0xe070f78d3927556a,0x85bbe253f47b1417, + 0x8c469ab843b89562,0x93956d7478ccec8e, + 0xaf58416654a6babb,0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, + 0x88fcf317f22241e2,0x441fece3bdf81f03, + 0xab3c2fddeeaad25a,0xd527e81cad7626c3, + 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, + 0x85c7056562757456,0xf6872d5667844e49, + 0xa738c6bebb12d16c,0xb428f8ac016561db, + 0xd106f86e69d785c7,0xe13336d701beba52, + 0x82a45b450226b39c,0xecc0024661173473, + 0xa34d721642b06084,0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, + 0xff290242c83396ce,0x7e67047175a15271, + 0x9f79a169bd203e41,0xf0062c6e984d386, + 0xc75809c42c684dd1,0x52c07b78a3e60868, + 0xf92e0c3537826145,0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb,0x88a66076400bb691, + 0xc2abf989935ddbfe,0x6acff893d00ea435, + 0xf356f7ebf83552fe,0x583f6b8c4124d43, + 0x98165af37b2153de,0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, + 0xeda2ee1c7064130c,0x1162def06f79df73, + 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, + 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0,0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, + 0x8da471a9de737e24,0x5ceaecfed289e5d2, + 0xb10d8e1456105dad,0x7425a83e872c5f47, + 0xdd50f1996b947518,0xd12f124e28f77719, + 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, + 
0xace73cbfdc0bfb7b,0x636cc64d1001550b, + 0xd8210befd30efa5a,0x3c47f7e05401aa4e, + 0x8714a775e3e95c78,0x65acfaec34810a71, + 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, + 0xd31045a8341ca07c,0x1ede48111209a050, + 0x83ea2b892091e44d,0x934aed0aab460432, + 0xa4e4b66b68b65d60,0xf81da84d5617853f, + 0xce1de40642e3f4b9,0x36251260ab9d668e, + 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, + 0xa1075a24e4421730,0xb24cf65b8612f81f, + 0xc94930ae1d529cfc,0xdee033f26797b627, + 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, + 0x9d412e0806e88aa5,0x8e1f289560ee864e, + 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2,0xae10af696774b1db, + 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f,0x17fd090a58d32af3, + 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, + 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513,0x84c86189216dc5ed, + 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515,0xfabaf3feaa5334a, + 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8,0x743e20e9ef511012, + 0xdf78e4b2bd342cf6,0x914da9246b255416, + 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, + 0xae9672aba3d0c320,0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, + 0x8865899617fb1871,0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, + 0xd51ea6fa85785631,0x552a74227f3ea565, + 0x8533285c936b35de,0xd53a88958f87275f, + 0xa67ff273b8460356,0x8a892abaf368f137, + 0xd01fef10a657842c,0x2d2b7569b0432d85, + 0x8213f56a67f6b29b,0x9c3b29620e29fc73, + 0xa298f2c501f45f42,0x8349f3ba91b47b8f, + 0xcb3f2f7642717713,0x241c70a936219a73, + 0xfe0efb53d30dd4d7,0xed238cd383aa0110, + 0x9ec95d1463e8a506,0xf4363804324a40aa, + 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da,0xdd94b7868e94050a, + 0x9b10a4e5e9913128,0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf,0xbc633b39673c8cec, + 0x976e41088617ca01,0xd5be0503e085d813, + 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, + 
0xec9c459d51852ba2,0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45,0xcabb90e5c942b503, + 0xb8da1662e7b00a17,0x3d6a751f3b936243, + 0xe7109bfba19c0c9d,0xcc512670a783ad4, + 0x906a617d450187e2,0x27fb2b80668b24c5, + 0xb484f9dc9641e9da,0xb1f9f660802dedf6, + 0xe1a63853bbd26451,0x5e7873f8a0396973, + 0x8d07e33455637eb2,0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7,0x7641a140cc7810fb, + 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, + 0xac2820d9623bf429,0x546345fa9fbdcd44, + 0xd732290fbacaf133,0xa97c177947ad4095, + 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, + 0xa81f301449ee8c70,0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c,0x73832eec6fff3111, + 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, + 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, + 0xa0555e361951c366,0xd7e105bcc332621f, + 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, + 0xfa856334878fc150,0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07,0xa862f80ec4700c8, + 0xf4a642e14c6262c8,0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, + 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, + 0xeeea5d5004981478,0x1858ccfce06cac74, + 0x95527a5202df0ccb,0xf37801e0c43ebc8, + 0xbaa718e68396cffd,0xd30560258f54e6ba, + 0xe950df20247c83fd,0x47c6b82ef32a2069, + 0x91d28b7416cdd27e,0x4cdc331d57fa5441, + 0xb6472e511c81471d,0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5,0x58180fddd97723a6, + 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; + +#endif // SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/internal/simdprune_tables.cpp b/contrib/libs/simdjson/src/internal/simdprune_tables.cpp new file mode 100644 index 000000000000..6b159944bcb7 --- /dev/null +++ b/contrib/libs/simdjson/src/internal/simdprune_tables.cpp @@ -0,0 +1,138 @@ +#ifndef SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP +#define SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP + +#include + +#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || 
SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 || SIMDJSON_IMPLEMENTATION_LSX || SIMDJSON_IMPLEMENTATION_LASX + +#include + +namespace simdjson { // table modified and copied from +namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable +SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256] = { + 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, + 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, + 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, + 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, + 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, + 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, + 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, + 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, + 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, + 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, + 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, + 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, + 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, + 14, 10, 12, 12, 14, 12, 14, 14, 16}; + +SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 
0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +// 256 * 8 bytes = 2kB, easily fits in cache. +SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256] = { + 0x0706050403020100, 0x0007060504030201, 0x0007060504030200, + 0x0000070605040302, 0x0007060504030100, 0x0000070605040301, + 0x0000070605040300, 0x0000000706050403, 0x0007060504020100, + 0x0000070605040201, 0x0000070605040200, 0x0000000706050402, + 0x0000070605040100, 0x0000000706050401, 0x0000000706050400, + 0x0000000007060504, 0x0007060503020100, 0x0000070605030201, + 0x0000070605030200, 0x0000000706050302, 0x0000070605030100, + 0x0000000706050301, 0x0000000706050300, 0x0000000007060503, + 0x0000070605020100, 0x0000000706050201, 0x0000000706050200, + 0x0000000007060502, 0x0000000706050100, 0x0000000007060501, + 0x0000000007060500, 0x0000000000070605, 0x0007060403020100, + 0x0000070604030201, 0x0000070604030200, 0x0000000706040302, + 0x0000070604030100, 0x0000000706040301, 0x0000000706040300, + 0x0000000007060403, 0x0000070604020100, 0x0000000706040201, + 0x0000000706040200, 0x0000000007060402, 0x0000000706040100, + 0x0000000007060401, 0x0000000007060400, 0x0000000000070604, + 0x0000070603020100, 0x0000000706030201, 0x0000000706030200, + 0x0000000007060302, 0x0000000706030100, 0x0000000007060301, + 0x0000000007060300, 0x0000000000070603, 0x0000000706020100, + 0x0000000007060201, 0x0000000007060200, 0x0000000000070602, + 0x0000000007060100, 0x0000000000070601, 0x0000000000070600, + 0x0000000000000706, 0x0007050403020100, 0x0000070504030201, + 0x0000070504030200, 0x0000000705040302, 0x0000070504030100, + 0x0000000705040301, 0x0000000705040300, 0x0000000007050403, + 0x0000070504020100, 0x0000000705040201, 0x0000000705040200, + 0x0000000007050402, 0x0000000705040100, 0x0000000007050401, + 
0x0000000007050400, 0x0000000000070504, 0x0000070503020100, + 0x0000000705030201, 0x0000000705030200, 0x0000000007050302, + 0x0000000705030100, 0x0000000007050301, 0x0000000007050300, + 0x0000000000070503, 0x0000000705020100, 0x0000000007050201, + 0x0000000007050200, 0x0000000000070502, 0x0000000007050100, + 0x0000000000070501, 0x0000000000070500, 0x0000000000000705, + 0x0000070403020100, 0x0000000704030201, 0x0000000704030200, + 0x0000000007040302, 0x0000000704030100, 0x0000000007040301, + 0x0000000007040300, 0x0000000000070403, 0x0000000704020100, + 0x0000000007040201, 0x0000000007040200, 0x0000000000070402, + 0x0000000007040100, 0x0000000000070401, 0x0000000000070400, + 0x0000000000000704, 0x0000000703020100, 0x0000000007030201, + 0x0000000007030200, 0x0000000000070302, 0x0000000007030100, + 0x0000000000070301, 0x0000000000070300, 0x0000000000000703, + 0x0000000007020100, 0x0000000000070201, 0x0000000000070200, + 0x0000000000000702, 0x0000000000070100, 0x0000000000000701, + 0x0000000000000700, 0x0000000000000007, 0x0006050403020100, + 0x0000060504030201, 0x0000060504030200, 0x0000000605040302, + 0x0000060504030100, 0x0000000605040301, 0x0000000605040300, + 0x0000000006050403, 0x0000060504020100, 0x0000000605040201, + 0x0000000605040200, 0x0000000006050402, 0x0000000605040100, + 0x0000000006050401, 0x0000000006050400, 0x0000000000060504, + 0x0000060503020100, 0x0000000605030201, 0x0000000605030200, + 0x0000000006050302, 0x0000000605030100, 0x0000000006050301, + 0x0000000006050300, 0x0000000000060503, 0x0000000605020100, + 0x0000000006050201, 0x0000000006050200, 0x0000000000060502, + 0x0000000006050100, 0x0000000000060501, 0x0000000000060500, + 0x0000000000000605, 0x0000060403020100, 0x0000000604030201, + 0x0000000604030200, 0x0000000006040302, 0x0000000604030100, + 0x0000000006040301, 0x0000000006040300, 0x0000000000060403, + 0x0000000604020100, 0x0000000006040201, 0x0000000006040200, + 0x0000000000060402, 0x0000000006040100, 0x0000000000060401, + 
0x0000000000060400, 0x0000000000000604, 0x0000000603020100, + 0x0000000006030201, 0x0000000006030200, 0x0000000000060302, + 0x0000000006030100, 0x0000000000060301, 0x0000000000060300, + 0x0000000000000603, 0x0000000006020100, 0x0000000000060201, + 0x0000000000060200, 0x0000000000000602, 0x0000000000060100, + 0x0000000000000601, 0x0000000000000600, 0x0000000000000006, + 0x0000050403020100, 0x0000000504030201, 0x0000000504030200, + 0x0000000005040302, 0x0000000504030100, 0x0000000005040301, + 0x0000000005040300, 0x0000000000050403, 0x0000000504020100, + 0x0000000005040201, 0x0000000005040200, 0x0000000000050402, + 0x0000000005040100, 0x0000000000050401, 0x0000000000050400, + 0x0000000000000504, 0x0000000503020100, 0x0000000005030201, + 0x0000000005030200, 0x0000000000050302, 0x0000000005030100, + 0x0000000000050301, 0x0000000000050300, 0x0000000000000503, + 0x0000000005020100, 0x0000000000050201, 0x0000000000050200, + 0x0000000000000502, 0x0000000000050100, 0x0000000000000501, + 0x0000000000000500, 0x0000000000000005, 0x0000000403020100, + 0x0000000004030201, 0x0000000004030200, 0x0000000000040302, + 0x0000000004030100, 0x0000000000040301, 0x0000000000040300, + 0x0000000000000403, 0x0000000004020100, 0x0000000000040201, + 0x0000000000040200, 0x0000000000000402, 0x0000000000040100, + 0x0000000000000401, 0x0000000000000400, 0x0000000000000004, + 0x0000000003020100, 0x0000000000030201, 0x0000000000030200, + 0x0000000000000302, 0x0000000000030100, 0x0000000000000301, + 0x0000000000000300, 0x0000000000000003, 0x0000000000020100, + 0x0000000000000201, 0x0000000000000200, 0x0000000000000002, + 0x0000000000000100, 0x0000000000000001, 0x0000000000000000, + 0x0000000000000000, +}; //static uint64_t thintable_epi8[256] + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 || 
SIMDJSON_IMPLEMENTATION_LSX || SIMDJSON_IMPLEMENTATION_LASX + +#endif // SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP diff --git a/contrib/libs/simdjson/src/simdjson.cpp b/contrib/libs/simdjson/src/simdjson.cpp new file mode 100644 index 000000000000..101279525336 --- /dev/null +++ b/contrib/libs/simdjson/src/simdjson.cpp @@ -0,0 +1,50 @@ +#define SIMDJSON_SRC_SIMDJSON_CPP + +#include + +SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define SIMDJSON_CONDITIONAL_INCLUDE + +#if SIMDJSON_IMPLEMENTATION_ARM64 +#include +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +#include +#endif +#if SIMDJSON_IMPLEMENTATION_ICELAKE +#include +#endif +#if SIMDJSON_IMPLEMENTATION_PPC64 +#error #include +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE +#include +#endif +#if SIMDJSON_IMPLEMENTATION_LSX +#error #include +#endif +#if SIMDJSON_IMPLEMENTATION_LASX +#error #include +#endif +#if SIMDJSON_IMPLEMENTATION_FALLBACK +#include +#endif +#undef SIMDJSON_CONDITIONAL_INCLUDE + +SIMDJSON_POP_DISABLE_UNUSED_WARNINGS + diff --git a/contrib/libs/simdjson/src/to_chars.cpp b/contrib/libs/simdjson/src/to_chars.cpp new file mode 100644 index 000000000000..ce71ff6cdb92 --- /dev/null +++ b/contrib/libs/simdjson/src/to_chars.cpp @@ -0,0 +1,954 @@ +#ifndef SIMDJSON_SRC_TO_CHARS_CPP +#define SIMDJSON_SRC_TO_CHARS_CPP + +#include + +#include +#include +#include +#include + +namespace simdjson { +namespace internal { +/*! +implements the Grisu2 algorithm for binary to decimal floating-point +conversion. +Adapted from JSON for Modern C++ + +This implementation is a slightly modified version of the reference +implementation which may be obtained from +http://florian.loitsch.com/publications (bench.tar.gz). +The code is distributed under the MIT license, Copyright (c) 2009 Florian +Loitsch. 
For a detailed description of the algorithm see: [1] Loitsch, "Printing +Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the +ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation, +PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and +Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming +Language Design and Implementation, PLDI 1996 +*/ +namespace dtoa_impl { + +template +Target reinterpret_bits(const Source source) { + static_assert(sizeof(Target) == sizeof(Source), "size mismatch"); + + Target target; + std::memcpy(&target, &source, sizeof(Source)); + return target; +} + +struct diyfp // f * 2^e +{ + static constexpr int kPrecision = 64; // = q + + std::uint64_t f = 0; + int e = 0; + + constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {} + + /*! + @brief returns x - y + @pre x.e == y.e and x.f >= y.f + */ + static diyfp sub(const diyfp &x, const diyfp &y) noexcept { + + return {x.f - y.f, x.e}; + } + + /*! + @brief returns x * y + @note The result is rounded. (Only the upper q bits are returned.) 
+ */ + static diyfp mul(const diyfp &x, const diyfp &y) noexcept { + static_assert(kPrecision == 64, "internal error"); + + // Computes: + // f = round((x.f * y.f) / 2^q) + // e = x.e + y.e + q + + // Emulate the 64-bit * 64-bit multiplication: + // + // p = u * v + // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi) + // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + + // 2^64 (u_hi v_hi ) = (p0 ) + 2^32 ((p1 ) + (p2 )) + // + 2^64 (p3 ) = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + + // 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) = + // (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + + // p2_hi + p3) = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) = (p0_lo ) + + // 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H ) + // + // (Since Q might be larger than 2^32 - 1) + // + // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H) + // + // (Q_hi + H does not overflow a 64-bit int) + // + // = p_lo + 2^64 p_hi + + const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; + const std::uint64_t u_hi = x.f >> 32u; + const std::uint64_t v_lo = y.f & 0xFFFFFFFFu; + const std::uint64_t v_hi = y.f >> 32u; + + const std::uint64_t p0 = u_lo * v_lo; + const std::uint64_t p1 = u_lo * v_hi; + const std::uint64_t p2 = u_hi * v_lo; + const std::uint64_t p3 = u_hi * v_hi; + + const std::uint64_t p0_hi = p0 >> 32u; + const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu; + const std::uint64_t p1_hi = p1 >> 32u; + const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu; + const std::uint64_t p2_hi = p2 >> 32u; + + std::uint64_t Q = p0_hi + p1_lo + p2_lo; + + // The full product might now be computed as + // + // p_hi = p3 + p2_hi + p1_hi + (Q >> 32) + // p_lo = p0_lo + (Q << 32) + // + // But in this particular case here, the full p_lo is not required. + // Effectively we only need to add the highest bit in p_lo to p_hi (and + // Q_hi + 1 does not overflow). + + Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up + + const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); + + return {h, x.e + y.e + 64}; + } + + /*! 
+ @brief normalize x such that the significand is >= 2^(q-1) + @pre x.f != 0 + */ + static diyfp normalize(diyfp x) noexcept { + + while ((x.f >> 63u) == 0) { + x.f <<= 1u; + x.e--; + } + + return x; + } + + /*! + @brief normalize x such that the result has the exponent E + @pre e >= x.e and the upper e - x.e bits of x.f must be zero. + */ + static diyfp normalize_to(const diyfp &x, + const int target_exponent) noexcept { + const int delta = x.e - target_exponent; + + return {x.f << delta, target_exponent}; + } +}; + +struct boundaries { + diyfp w; + diyfp minus; + diyfp plus; +}; + +/*! +Compute the (normalized) diyfp representing the input number 'value' and its +boundaries. +@pre value must be finite and positive +*/ +template boundaries compute_boundaries(FloatType value) { + + // Convert the IEEE representation into a diyfp. + // + // If v is denormal: + // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1)) + // If v is normalized: + // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1)) + + static_assert(std::numeric_limits::is_iec559, + "internal error: dtoa_short requires an IEEE-754 " + "floating-point implementation"); + + constexpr int kPrecision = + std::numeric_limits::digits; // = p (includes the hidden bit) + constexpr int kBias = + std::numeric_limits::max_exponent - 1 + (kPrecision - 1); + constexpr int kMinExp = 1 - kBias; + constexpr std::uint64_t kHiddenBit = std::uint64_t{1} + << (kPrecision - 1); // = 2^(p-1) + + using bits_type = typename std::conditional::type; + + const std::uint64_t bits = reinterpret_bits(value); + const std::uint64_t E = bits >> (kPrecision - 1); + const std::uint64_t F = bits & (kHiddenBit - 1); + + const bool is_denormal = E == 0; + const diyfp v = is_denormal + ? diyfp(F, kMinExp) + : diyfp(F + kHiddenBit, static_cast(E) - kBias); + + // Compute the boundaries m- and m+ of the floating-point value + // v = f * 2^e. 
+ // + // Determine v- and v+, the floating-point predecessor and successor if v, + // respectively. + // + // v- = v - 2^e if f != 2^(p-1) or e == e_min (A) + // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B) + // + // v+ = v + 2^e + // + // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_ + // between m- and m+ round to v, regardless of how the input rounding + // algorithm breaks ties. + // + // ---+-------------+-------------+-------------+-------------+--- (A) + // v- m- v m+ v+ + // + // -----------------+------+------+-------------+-------------+--- (B) + // v- m- v m+ v+ + + const bool lower_boundary_is_closer = F == 0 && E > 1; + const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1); + const diyfp m_minus = lower_boundary_is_closer + ? diyfp(4 * v.f - 1, v.e - 2) // (B) + : diyfp(2 * v.f - 1, v.e - 1); // (A) + + // Determine the normalized w+ = m+. + const diyfp w_plus = diyfp::normalize(m_plus); + + // Determine w- = m- such that e_(w-) = e_(w+). + const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e); + + return {diyfp::normalize(v), w_minus, w_plus}; +} + +// Given normalized diyfp w, Grisu needs to find a (normalized) cached +// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies +// within a certain range [alpha, gamma] (Definition 3.2 from [1]) +// +// alpha <= e = e_c + e_w + q <= gamma +// +// or +// +// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q +// <= f_c * f_w * 2^gamma +// +// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies +// +// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma +// +// or +// +// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma) +// +// The choice of (alpha,gamma) determines the size of the table and the form of +// the digit generation procedure. 
Using (alpha,gamma)=(-60,-32) works out well +// in practice: +// +// The idea is to cut the number c * w = f * 2^e into two parts, which can be +// processed independently: An integral part p1, and a fractional part p2: +// +// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e +// = (f div 2^-e) + (f mod 2^-e) * 2^e +// = p1 + p2 * 2^e +// +// The conversion of p1 into decimal form requires a series of divisions and +// modulos by (a power of) 10. These operations are faster for 32-bit than for +// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be +// achieved by choosing +// +// -e >= 32 or e <= -32 := gamma +// +// In order to convert the fractional part +// +// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ... +// +// into decimal form, the fraction is repeatedly multiplied by 10 and the digits +// d[-i] are extracted in order: +// +// (10 * p2) div 2^-e = d[-1] +// (10 * p2) mod 2^-e = d[-2] / 10^1 + ... +// +// The multiplication by 10 must not overflow. It is sufficient to choose +// +// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64. +// +// Since p2 = f mod 2^-e < 2^-e, +// +// -e <= 60 or e >= -60 := alpha + +constexpr int kAlpha = -60; +constexpr int kGamma = -32; + +struct cached_power // c = f * 2^e ~= 10^k +{ + std::uint64_t f; + int e; + int k; +}; + +/*! +For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached +power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c +satisfies (Definition 3.2 from [1]) + alpha <= e_c + e + q <= gamma. +*/ +inline cached_power get_cached_power_for_binary_exponent(int e) { + // Now + // + // alpha <= e_c + e + q <= gamma (1) + // ==> f_c * 2^alpha <= c * 2^e * 2^q + // + // and since the c's are normalized, 2^(q-1) <= f_c, + // + // ==> 2^(q - 1 + alpha) <= c * 2^(e + q) + // ==> 2^(alpha - e - 1) <= c + // + // If c were an exact power of ten, i.e. 
c = 10^k, one may determine k as + // + // k = ceil( log_10( 2^(alpha - e - 1) ) ) + // = ceil( (alpha - e - 1) * log_10(2) ) + // + // From the paper: + // "In theory the result of the procedure could be wrong since c is rounded, + // and the computation itself is approximated [...]. In practice, however, + // this simple function is sufficient." + // + // For IEEE double precision floating-point numbers converted into + // normalized diyfp's w = f * 2^e, with q = 64, + // + // e >= -1022 (min IEEE exponent) + // -52 (p - 1) + // -52 (p - 1, possibly normalize denormal IEEE numbers) + // -11 (normalize the diyfp) + // = -1137 + // + // and + // + // e <= +1023 (max IEEE exponent) + // -52 (p - 1) + // -11 (normalize the diyfp) + // = 960 + // + // This binary exponent range [-1137,960] results in a decimal exponent + // range [-307,324]. One does not need to store a cached power for each + // k in this range. For each such k it suffices to find a cached power + // such that the exponent of the product lies in [alpha,gamma]. + // This implies that the difference of the decimal exponents of adjacent + // table entries must be less than or equal to + // + // floor( (gamma - alpha) * log_10(2) ) = 8. + // + // (A smaller distance gamma-alpha would require a larger table.) + + // NB: + // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34. 
+ + constexpr int kCachedPowersMinDecExp = -300; + constexpr int kCachedPowersDecStep = 8; + + static constexpr std::array kCachedPowers = {{ + {0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292}, + {0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276}, + {0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260}, + {0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244}, + {0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228}, + {0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212}, + {0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196}, + {0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180}, + {0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164}, + {0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148}, + {0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132}, + {0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116}, + {0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100}, + {0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84}, + {0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68}, + {0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52}, + {0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36}, + {0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20}, + {0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4}, + {0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12}, + {0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28}, + {0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44}, + {0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60}, + {0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76}, + {0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92}, + {0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108}, + {0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124}, + {0xB454E4A179DD1877, 375, 
132}, {0x865B86925B9BC5C2, 402, 140}, + {0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156}, + {0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172}, + {0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188}, + {0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204}, + {0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220}, + {0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236}, + {0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252}, + {0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268}, + {0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284}, + {0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300}, + {0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316}, + {0x9E19DB92B4E31BA9, 1013, 324}, + }}; + + // This computation gives exactly the same results for k as + // k = ceil((kAlpha - e - 1) * 0.30102999566398114) + // for |e| <= 1500, but doesn't require floating-point operations. + // NB: log_10(2) ~= 78913 / 2^18 + const int f = kAlpha - e - 1; + const int k = (f * 78913) / (1 << 18) + static_cast(f > 0); + + const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / + kCachedPowersDecStep; + + const cached_power cached = kCachedPowers[static_cast(index)]; + + return cached; +} + +/*! +For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k. +For n == 0, returns 1 and sets pow10 := 1. 
+*/ +inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) { + // LCOV_EXCL_START + if (n >= 1000000000) { + pow10 = 1000000000; + return 10; + } + // LCOV_EXCL_STOP + else if (n >= 100000000) { + pow10 = 100000000; + return 9; + } else if (n >= 10000000) { + pow10 = 10000000; + return 8; + } else if (n >= 1000000) { + pow10 = 1000000; + return 7; + } else if (n >= 100000) { + pow10 = 100000; + return 6; + } else if (n >= 10000) { + pow10 = 10000; + return 5; + } else if (n >= 1000) { + pow10 = 1000; + return 4; + } else if (n >= 100) { + pow10 = 100; + return 3; + } else if (n >= 10) { + pow10 = 10; + return 2; + } else { + pow10 = 1; + return 1; + } +} + +inline void grisu2_round(char *buf, int len, std::uint64_t dist, + std::uint64_t delta, std::uint64_t rest, + std::uint64_t ten_k) { + + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // ten_k + // <------> + // <---- rest ----> + // --------------[------------------+----+--------------]-------------- + // w V + // = buf * 10^k + // + // ten_k represents a unit-in-the-last-place in the decimal representation + // stored in buf. + // Decrement buf by ten_k while this takes buf closer to w. + + // The tests are written in this order to avoid overflow in unsigned + // integer arithmetic. + + while (rest < dist && delta - rest >= ten_k && + (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) { + buf[len - 1]--; + rest += ten_k; + } +} + +/*! +Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+. +M- and M+ must be normalized and share the same exponent -60 <= e <= -32. 
+*/ +inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent, + diyfp M_minus, diyfp w, diyfp M_plus) { + static_assert(kAlpha >= -60, "internal error"); + static_assert(kGamma <= -32, "internal error"); + + // Generates the digits (and the exponent) of a decimal floating-point + // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's + // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= + // gamma. + // + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // Grisu2 generates the digits of M+ from left to right and stops as soon as + // V is in [M-,M+]. + + std::uint64_t delta = + diyfp::sub(M_plus, M_minus) + .f; // (significand of (M+ - M-), implicit exponent is e) + std::uint64_t dist = + diyfp::sub(M_plus, w) + .f; // (significand of (M+ - w ), implicit exponent is e) + + // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0): + // + // M+ = f * 2^e + // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e + // = ((p1 ) * 2^-e + (p2 )) * 2^e + // = p1 + p2 * 2^e + + const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); + + auto p1 = static_cast( + M_plus.f >> + -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.) 
+ std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e + + // 1) + // + // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0] + + std::uint32_t pow10; + const int k = find_largest_pow10(p1, pow10); + + // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1) + // + // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1)) + // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1)) + // + // M+ = p1 + p2 * 2^e + // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e + // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e + // = d[k-1] * 10^(k-1) + ( rest) * 2^e + // + // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0) + // + // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0] + // + // but stop as soon as + // + // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e + + int n = k; + while (n > 0) { + // Invariants: + // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k) + // pow10 = 10^(n-1) <= p1 < 10^n + // + const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1) + const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1) + // + // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e + // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e) + // + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(n-1) + (r + p2 * 2^e) + // + p1 = r; + n--; + // + // M+ = buffer * 10^n + (p1 + p2 * 2^e) + // pow10 = 10^n + // + + // Now check if enough digits have been generated. + // Compute + // + // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e + // + // Note: + // Since rest and delta share the same exponent e, it suffices to + // compare the significands. + const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2; + if (rest <= delta) { + // V = buffer * 10^n, with M- <= V <= M+. + + decimal_exponent += n; + + // We may now just stop. But instead look if the buffer could be + // decremented to bring V closer to w. + // + // pow10 = 10^n is now 1 ulp in the decimal representation V. 
+ // The rounding procedure works with diyfp's with an implicit + // exponent of e. + // + // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e + // + const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e; + grisu2_round(buffer, length, dist, delta, rest, ten_n); + + return; + } + + pow10 /= 10; + // + // pow10 = 10^(n-1) <= p1 < 10^n + // Invariants restored. + } + + // 2) + // + // The digits of the integral part have been generated: + // + // M+ = d[k-1]...d[1]d[0] + p2 * 2^e + // = buffer + p2 * 2^e + // + // Now generate the digits of the fractional part p2 * 2^e. + // + // Note: + // No decimal point is generated: the exponent is adjusted instead. + // + // p2 actually represents the fraction + // + // p2 * 2^e + // = p2 / 2^-e + // = d[-1] / 10^1 + d[-2] / 10^2 + ... + // + // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...) + // + // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m + // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...) + // + // using + // + // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e) + // = ( d) * 2^-e + ( r) + // + // or + // 10^m * p2 * 2^e = d + r * 2^e + // + // i.e. + // + // M+ = buffer + p2 * 2^e + // = buffer + 10^-m * (d + r * 2^e) + // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e + // + // and stop as soon as 10^-m * r * 2^e <= delta * 2^e + + int m = 0; + for (;;) { + // Invariant: + // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) 
+ // * 2^e + // = buffer * 10^-m + 10^-m * (p2 ) + // * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e = + // buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + + // (10*p2 mod 2^-e)) * 2^e + // + p2 *= 10; + const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e + const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e + // + // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e)) + // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + p2 = r; + m++; + // + // M+ = buffer * 10^-m + 10^-m * p2 * 2^e + // Invariant restored. + + // Check if enough digits have been generated. + // + // 10^-m * p2 * 2^e <= delta * 2^e + // p2 * 2^e <= 10^m * delta * 2^e + // p2 <= 10^m * delta + delta *= 10; + dist *= 10; + if (p2 <= delta) { + break; + } + } + + // V = buffer * 10^-m, with M- <= V <= M+. + + decimal_exponent -= m; + + // 1 ulp in the decimal representation is now 10^-m. + // Since delta and dist are now scaled by 10^m, we need to do the + // same with ulp in order to keep the units in sync. + // + // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e + // + const std::uint64_t ten_m = one.f; + grisu2_round(buffer, length, dist, delta, p2, ten_m); + + // By construction this algorithm generates the shortest possible decimal + // number (Loitsch, Theorem 6.2) which rounds back to w. + // For an input number of precision p, at least + // + // N = 1 + ceil(p * log_10(2)) + // + // decimal digits are sufficient to identify all binary floating-point + // numbers (Matula, "In-and-Out conversions"). + // This implies that the algorithm does not produce more than N decimal + // digits. + // + // N = 17 for p = 53 (IEEE double precision) + // N = 9 for p = 24 (IEEE single precision) +} + +/*! 
+v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus, + diyfp v, diyfp m_plus) { + + // --------(-----------------------+-----------------------)-------- (A) + // m- v m+ + // + // --------------------(-----------+-----------------------)-------- (B) + // m- v m+ + // + // First scale v (and m- and m+) such that the exponent is in the range + // [alpha, gamma]. + + const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e); + + const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k + + // The exponent of the products is = v.e + c_minus_k.e + q and is in the range + // [alpha,gamma] + const diyfp w = diyfp::mul(v, c_minus_k); + const diyfp w_minus = diyfp::mul(m_minus, c_minus_k); + const diyfp w_plus = diyfp::mul(m_plus, c_minus_k); + + // ----(---+---)---------------(---+---)---------------(---+---)---- + // w- w w+ + // = c*m- = c*v = c*m+ + // + // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and + // w+ are now off by a small amount. + // In fact: + // + // w - v * 10^k < 1 ulp + // + // To account for this inaccuracy, add resp. subtract 1 ulp. + // + // --------+---[---------------(---+---)---------------]---+-------- + // w- M- w M+ w+ + // + // Now any number in [M-, M+] (bounds included) will round to w when input, + // regardless of how the input rounding algorithm breaks ties. + // + // And digit_gen generates the shortest possible such number in [M-, M+]. + // Note that this does not mean that Grisu2 always generates the shortest + // possible number in the interval (m-, m+). + const diyfp M_minus(w_minus.f + 1, w_minus.e); + const diyfp M_plus(w_plus.f - 1, w_plus.e); + + decimal_exponent = -cached.k; // = -(-k) = k + + grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus); +} + +/*! 
+v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +template +void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) { + static_assert(diyfp::kPrecision >= std::numeric_limits::digits + 3, + "internal error: not enough precision"); + + // If the neighbors (and boundaries) of 'value' are always computed for + // double-precision numbers, all float's can be recovered using strtod (and + // strtof). However, the resulting decimal representations are not exactly + // "short". + // + // The documentation for 'std::to_chars' + // (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is + // converted to a string as if by std::sprintf in the default ("C") locale" + // and since sprintf promotes float's to double's, I think this is exactly + // what 'std::to_chars' does. On the other hand, the documentation for + // 'std::to_chars' requires that "parsing the representation using the + // corresponding std::from_chars function recovers value exactly". That + // indicates that single precision floating-point numbers should be recovered + // using 'std::strtof'. + // + // NB: If the neighbors are computed for single-precision numbers, there is a + // single float + // (7.0385307e-26f) which can't be recovered using strtod. The resulting + // double precision value is off by 1 ulp. +#if 0 + const boundaries w = compute_boundaries(static_cast(value)); +#else + const boundaries w = compute_boundaries(value); +#endif + + grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus); +} + +/*! +@brief appends a decimal representation of e to buf +@return a pointer to the element following the exponent. +@pre -1000 < e < 1000 +*/ +inline char *append_exponent(char *buf, int e) { + + if (e < 0) { + e = -e; + *buf++ = '-'; + } else { + *buf++ = '+'; + } + + auto k = static_cast(e); + if (k < 10) { + // Always print at least two digits in the exponent. 
+ // This is for compatibility with printf("%g"). + *buf++ = '0'; + *buf++ = static_cast('0' + k); + } else if (k < 100) { + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } else { + *buf++ = static_cast('0' + k / 100); + k %= 100; + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + + return buf; +} + +/*! +@brief prettify v = buf * 10^decimal_exponent +If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point +notation. Otherwise it will be printed in exponential notation. +@pre min_exp < 0 +@pre max_exp > 0 +*/ +inline char *format_buffer(char *buf, int len, int decimal_exponent, + int min_exp, int max_exp) { + + const int k = len; + const int n = len + decimal_exponent; + + // v = buf * 10^(n-k) + // k is the length of the buffer (number of decimal digits) + // n is the position of the decimal point relative to the start of the buffer. + + if (k <= n && n <= max_exp) { + // digits[000] + // len <= max_exp + 2 + + std::memset(buf + k, '0', static_cast(n) - static_cast(k)); + // Make it look like a floating-point number (#362, #378) + buf[n + 0] = '.'; + buf[n + 1] = '0'; + return buf + (static_cast(n)) + 2; + } + + if (0 < n && n <= max_exp) { + // dig.its + // len <= max_digits10 + 1 + std::memmove(buf + (static_cast(n) + 1), buf + n, + static_cast(k) - static_cast(n)); + buf[n] = '.'; + return buf + (static_cast(k) + 1U); + } + + if (min_exp < n && n <= 0) { + // 0.[000]digits + // len <= 2 + (-min_exp - 1) + max_digits10 + + std::memmove(buf + (2 + static_cast(-n)), buf, + static_cast(k)); + buf[0] = '0'; + buf[1] = '.'; + std::memset(buf + 2, '0', static_cast(-n)); + return buf + (2U + static_cast(-n) + static_cast(k)); + } + + if (k == 1) { + // dE+123 + // len <= 1 + 5 + + buf += 1; + } else { + // d.igitsE+123 + // len <= max_digits10 + 1 + 5 + + std::memmove(buf + 2, buf + 1, static_cast(k) - 1); + buf[1] = '.'; + buf += 1 + static_cast(k); + } + + *buf++ = 'e'; + 
return append_exponent(buf, n - 1); +} + +} // namespace dtoa_impl + +/*! +The format of the resulting decimal representation is similar to printf's %g +format. Returns an iterator pointing past-the-end of the decimal representation. +@note The input number must be finite, i.e. NaN's and Inf's are not supported. +@note The buffer must be large enough. +@note The result is NOT null-terminated. +*/ +char *to_chars(char *first, const char *last, double value) { + static_cast(last); // maybe unused - fix warning + bool negative = std::signbit(value); + if (negative) { + value = -value; + *first++ = '-'; + } + + if (value == 0) // +-0 + { + *first++ = '0'; + // Make it look like a floating-point number (#362, #378) + *first++ = '.'; + *first++ = '0'; + return first; + } + // Compute v = buffer * 10^decimal_exponent. + // The decimal digits are stored in the buffer, which needs to be interpreted + // as an unsigned decimal integer. + // len is the length of the buffer, i.e. the number of decimal digits. 
+ int len = 0; + int decimal_exponent = 0; + dtoa_impl::grisu2(first, len, decimal_exponent, value); + // Format the buffer like printf("%.*g", prec, value) + constexpr int kMinExp = -4; + constexpr int kMaxExp = std::numeric_limits::digits10; + + return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, + kMaxExp); +} +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_SRC_TO_CHARS_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/src/westmere.cpp b/contrib/libs/simdjson/src/westmere.cpp new file mode 100644 index 000000000000..538db42f898a --- /dev/null +++ b/contrib/libs/simdjson/src/westmere.cpp @@ -0,0 +1,174 @@ +#ifndef SIMDJSON_SRC_WESTMERE_CPP +#define SIMDJSON_SRC_WESTMERE_CPP + +#ifndef SIMDJSON_CONDITIONAL_INCLUDE +#include +#endif // SIMDJSON_CONDITIONAL_INCLUDE + +#include +#include + +#include +#include +#include +#include + +// +// Stage 1 +// + +namespace simdjson { +namespace westmere { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +namespace { + +using namespace simd; + +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. 
+ // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. + // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). + + + const uint64_t whitespace = in.eq({ + _mm_shuffle_epi8(whitespace_table, in.chunks[0]), + _mm_shuffle_epi8(whitespace_table, in.chunks[1]), + _mm_shuffle_epi8(whitespace_table, in.chunks[2]), + _mm_shuffle_epi8(whitespace_table, in.chunks[3]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20, + in.chunks[1] | 0x20, + in.chunks[2] | 0x20, + in.chunks[3] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm_shuffle_epi8(op_table, in.chunks[0]), + _mm_shuffle_epi8(op_table, in.chunks[1]), + _mm_shuffle_epi8(op_table, in.chunks[2]), + _mm_shuffle_epi8(op_table, in.chunks[3]) + }); + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 
is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80 + return is_third_byte | is_fourth_byte; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +// +// Stage 2 +// + +// +// Implementation-specific overrides +// + +namespace simdjson { +namespace westmere { + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept { + return westmere::stringparsing::parse_string(src, dst, 
replacement_char); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept { + return westmere::stringparsing::parse_wobbly_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace westmere +} // namespace simdjson + +#include + +#endif // SIMDJSON_SRC_WESTMERE_CPP \ No newline at end of file diff --git a/contrib/libs/simdjson/ya.make b/contrib/libs/simdjson/ya.make new file mode 100644 index 000000000000..86d246d1a79f --- /dev/null +++ b/contrib/libs/simdjson/ya.make @@ -0,0 +1,35 @@ +# Generated by devtools/yamaker from nixpkgs 22.11. + +LIBRARY() + +LICENSE( + Apache-2.0 AND + BSD-3-Clause AND + MIT +) + +LICENSE_TEXTS(.yandex_meta/licenses.list.txt) + +VERSION(3.10.1) + +ORIGINAL_SOURCE(https://github.com/simdjson/simdjson/archive/v3.10.1.tar.gz) + +ADDINCL( + GLOBAL contrib/libs/simdjson/include + contrib/libs/simdjson/src +) + +NO_COMPILER_WARNINGS() + +NO_UTIL() + +CFLAGS( + -DSIMDJSON_AVX512_ALLOWED=1 + -DSIMDJSON_UTF8VALIDATION=1 +) + +SRCS( + src/simdjson.cpp +) + +END() diff --git a/ydb/ci/rightlib.txt b/ydb/ci/rightlib.txt index c413eb92a809..125e3e6a64e3 100644 --- a/ydb/ci/rightlib.txt +++ b/ydb/ci/rightlib.txt @@ -1 +1 @@ -10d8655dd385fe03395d60abfbb5903fcc87b2a4 +878b50bb2ed42fd7ce11c383bb2a0966c8281eed From 6fcb3bdd04afc21a39b65af3f50de6991e448709 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 16 Oct 2024 16:53:37 +0300 Subject: [PATCH 41/56] Shared reading: merge to q-stable-2024-07-08 (#10488) Co-authored-by: Pisarenko Grigoriy Co-authored-by: uzhastik Co-authored-by: Fiodar Miron <61616792+fedor-miron@users.noreply.github.com> Co-authored-by: yumkam Co-authored-by: Fiodar Miron --- .../libs/config/protos/row_dispatcher.proto | 9 +- 
.../fq/libs/control_plane_storage/ya.make | 7 +- ydb/core/fq/libs/init/init.cpp | 32 +- ydb/core/fq/libs/init/ya.make | 11 +- .../fq/libs/row_dispatcher/actors_factory.cpp | 6 +- .../fq/libs/row_dispatcher/actors_factory.h | 4 +- .../fq/libs/row_dispatcher/coordinator.cpp | 1 - .../fq/libs/row_dispatcher/json_filter.cpp | 84 +++- ydb/core/fq/libs/row_dispatcher/json_filter.h | 13 +- .../fq/libs/row_dispatcher/json_parser.cpp | 470 ++++++++---------- ydb/core/fq/libs/row_dispatcher/json_parser.h | 30 +- .../libs/row_dispatcher/leader_election.cpp | 4 + .../fq/libs/row_dispatcher/row_dispatcher.cpp | 30 +- .../fq/libs/row_dispatcher/row_dispatcher.h | 5 +- .../row_dispatcher/row_dispatcher_service.cpp | 8 +- .../row_dispatcher/row_dispatcher_service.h | 5 +- .../fq/libs/row_dispatcher/topic_session.cpp | 281 +++++++---- .../fq/libs/row_dispatcher/topic_session.h | 6 +- .../libs/row_dispatcher/ut/json_filter_ut.cpp | 50 +- .../libs/row_dispatcher/ut/json_parser_ut.cpp | 183 +++++-- .../row_dispatcher/ut/leader_election_ut.cpp | 23 +- .../row_dispatcher/ut/row_dispatcher_ut.cpp | 16 +- .../row_dispatcher/ut/topic_session_ut.cpp | 98 +++- ydb/core/fq/libs/row_dispatcher/ut/ya.make | 1 - ydb/core/fq/libs/row_dispatcher/ya.make | 9 +- .../dq/local_gateway/yql_dq_gateway_local.cpp | 18 +- .../dq/local_gateway/yql_dq_gateway_local.h | 4 +- .../pq/async_io/dq_pq_rd_read_actor.cpp | 162 ++++-- .../pq/async_io/dq_pq_rd_read_actor.h | 2 +- .../pq/async_io/dq_pq_read_actor.cpp | 22 +- .../providers/pq/async_io/dq_pq_read_actor.h | 4 +- .../pq/expr_nodes/yql_pq_expr_nodes.json | 3 +- .../yql/providers/pq/gateway/dummy/ya.make | 1 + .../pq/gateway/dummy/yql_pq_dummy_gateway.cpp | 9 + .../pq/gateway/dummy/yql_pq_dummy_gateway.h | 14 +- .../dummy/yql_pq_file_topic_client.cpp | 314 ++++++++++++ .../gateway/dummy/yql_pq_file_topic_client.h | 36 ++ .../pq/gateway/native/yql_pq_gateway.cpp | 6 + .../yql/providers/pq/provider/ut/ya.make | 41 ++ .../providers/pq/provider/ut/yql_pq_ut.cpp 
| 231 +++++++++ ydb/library/yql/providers/pq/provider/ya.make | 4 + .../provider/yql_pq_datasource_type_ann.cpp | 2 +- .../pq/provider/yql_pq_dq_integration.cpp | 38 +- .../providers/pq/provider/yql_pq_gateway.h | 5 + .../pq/provider/yql_pq_logical_opt.cpp | 115 +++-- .../pq/provider/yql_pq_topic_client.h | 89 ++++ ydb/library/yql/tools/dq/worker_node/main.cpp | 12 +- ydb/library/yql/tools/dq/worker_node/ya.make | 1 + ydb/library/yql/tools/dqrun/dqrun.cpp | 82 ++- ydb/library/yql/tools/dqrun/examples/fq.conf | 21 + .../yql/tools/dqrun/examples/gateways.conf | 15 + ydb/library/yql/tools/dqrun/ya.make | 3 +- ydb/library/yql/tools/mrrun/mrrun.cpp | 12 +- ydb/library/yql/tools/mrrun/ya.make | 1 + .../pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp | 75 ++- ydb/tests/fq/pq_async_io/ut/ya.make | 1 + ydb/tests/fq/pq_async_io/ut_helpers.cpp | 16 + ydb/tests/fq/pq_async_io/ut_helpers.h | 1 + ydb/tests/fq/pq_async_io/ya.make | 4 + ydb/tests/fq/yds/test_row_dispatcher.py | 144 +++++- 60 files changed, 2158 insertions(+), 736 deletions(-) create mode 100644 ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.cpp create mode 100644 ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.h create mode 100644 ydb/library/yql/providers/pq/provider/ut/ya.make create mode 100644 ydb/library/yql/providers/pq/provider/ut/yql_pq_ut.cpp create mode 100644 ydb/library/yql/providers/pq/provider/yql_pq_topic_client.h create mode 100644 ydb/library/yql/tools/dqrun/examples/fq.conf diff --git a/ydb/core/fq/libs/config/protos/row_dispatcher.proto b/ydb/core/fq/libs/config/protos/row_dispatcher.proto index 10ca10285ea0..0607f63dd45c 100644 --- a/ydb/core/fq/libs/config/protos/row_dispatcher.proto +++ b/ydb/core/fq/libs/config/protos/row_dispatcher.proto @@ -11,13 +11,20 @@ import "ydb/core/fq/libs/config/protos/storage.proto"; message TRowDispatcherCoordinatorConfig { TYdbStorageConfig Database = 1; string CoordinationNodePath = 2; + bool LocalMode = 3; // Use only local 
row_dispatcher. } + +message TJsonParserConfig { + uint64 BatchSizeBytes = 1; + uint64 BatchCreationTimeoutMs = 2; +} + message TRowDispatcherConfig { bool Enabled = 1; uint64 TimeoutBeforeStartSessionSec = 2; uint64 SendStatusPeriodSec = 3; uint64 MaxSessionUsedMemory = 4; bool WithoutConsumer = 5; + TJsonParserConfig JsonParser = 7; TRowDispatcherCoordinatorConfig Coordinator = 6; - } diff --git a/ydb/core/fq/libs/control_plane_storage/ya.make b/ydb/core/fq/libs/control_plane_storage/ya.make index f3746eb34b28..8bb6fd73ed52 100644 --- a/ydb/core/fq/libs/control_plane_storage/ya.make +++ b/ydb/core/fq/libs/control_plane_storage/ya.make @@ -20,6 +20,7 @@ PEERDIR( library/cpp/lwtrace library/cpp/protobuf/interop ydb/core/base + ydb/core/external_sources ydb/core/fq/libs/actors/logging ydb/core/fq/libs/common ydb/core/fq/libs/config @@ -33,13 +34,13 @@ PEERDIR( ydb/core/fq/libs/shared_resources ydb/core/fq/libs/ydb ydb/core/mon + ydb/library/db_pool ydb/library/security + ydb/library/yql/providers/s3/path_generator + ydb/library/yql/public/issue ydb/public/api/protos ydb/public/sdk/cpp/client/ydb_scheme ydb/public/sdk/cpp/client/ydb_table - ydb/library/db_pool - ydb/library/yql/providers/s3/path_generator - ydb/library/yql/public/issue ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/fq/libs/init/init.cpp b/ydb/core/fq/libs/init/init.cpp index bb071a5a618d..d1cce22b648f 100644 --- a/ydb/core/fq/libs/init/init.cpp +++ b/ydb/core/fq/libs/init/init.cpp @@ -44,11 +44,8 @@ #include #include #include +#include #include -#include -#include -#include -#include #include @@ -158,7 +155,6 @@ void Init( TVector compNodeFactories = { NYql::GetCommonDqFactory(), - NYql::GetDqYdbFactory(yqSharedResources->UserSpaceYdbDriver), NKikimr::NMiniKQL::GetYqlFactory() }; @@ -166,8 +162,7 @@ void Init( NKikimr::NMiniKQL::TComputationNodeFactory dqCompFactory = NKikimr::NMiniKQL::GetCompositeWithBuiltinFactory(std::move(compNodeFactories)); NYql::TTaskTransformFactory dqTaskTransformFactory = 
NYql::CreateCompositeTaskTransformFactory({ - NYql::CreateCommonDqTaskTransformFactory(), - NYql::CreateYdbDqTaskTransformFactory() + NYql::CreateCommonDqTaskTransformFactory() }); auto asyncIoFactory = MakeIntrusive(); @@ -195,14 +190,21 @@ void Init( } if (protoConfig.GetRowDispatcher().GetEnabled()) { + NYql::TPqGatewayServices pqServices( + yqSharedResources->UserSpaceYdbDriver, + nullptr, + nullptr, + std::make_shared(), + nullptr); + auto rowDispatcher = NFq::NewRowDispatcherService( protoConfig.GetRowDispatcher(), - protoConfig.GetCommon(), NKikimr::CreateYdbCredentialsProviderFactory, yqSharedResources, credentialsFactory, tenant, - yqCounters->GetSubgroup("subsystem", "row_dispatcher")); + yqCounters->GetSubgroup("subsystem", "row_dispatcher"), + CreatePqNativeGateway(pqServices)); actorRegistrator(NFq::RowDispatcherServiceActorId(), rowDispatcher.release()); } @@ -225,8 +227,16 @@ void Init( } RegisterDqInputTransformLookupActorFactory(*asyncIoFactory); - RegisterDqPqReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory, yqCounters->GetSubgroup("subsystem", "DqSourceTracker")); - RegisterYdbReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory); + + NYql::TPqGatewayServices pqServices( + yqSharedResources->UserSpaceYdbDriver, + pqCmConnections, + credentialsFactory, + std::make_shared(protoConfig.GetGateways().GetPq()), + appData->FunctionRegistry + ); + RegisterDqPqReadActorFactory(*asyncIoFactory, yqSharedResources->UserSpaceYdbDriver, credentialsFactory, NYql::CreatePqNativeGateway(std::move(pqServices)), + yqCounters->GetSubgroup("subsystem", "DqSourceTracker")); s3ActorsFactory->RegisterS3ReadActorFactory(*asyncIoFactory, credentialsFactory, httpGateway, s3HttpRetryPolicy, readActorFactoryCfg, yqCounters->GetSubgroup("subsystem", "S3ReadActor"), protoConfig.GetGateways().GetS3().GetAllowLocalFiles()); diff --git a/ydb/core/fq/libs/init/ya.make b/ydb/core/fq/libs/init/ya.make 
index 857052758169..3ea6b5350874 100644 --- a/ydb/core/fq/libs/init/ya.make +++ b/ydb/core/fq/libs/init/ya.make @@ -5,8 +5,6 @@ SRCS( ) PEERDIR( - ydb/library/actors/core - ydb/library/actors/http ydb/core/base ydb/core/fq/libs/actors ydb/core/fq/libs/audit @@ -28,15 +26,16 @@ PEERDIR( ydb/core/fq/libs/shared_resources ydb/core/fq/libs/test_connection ydb/core/protos + ydb/library/actors/core + ydb/library/actors/http ydb/library/folder_service ydb/library/folder_service/proto ydb/library/security - ydb/library/yql/minikql/comp_nodes - ydb/library/yql/utils/actor_log ydb/library/yql/dq/actors/compute ydb/library/yql/dq/actors/input_transforms ydb/library/yql/dq/comp_nodes ydb/library/yql/dq/transform + ydb/library/yql/minikql/comp_nodes ydb/library/yql/providers/common/comp_nodes ydb/library/yql/providers/common/metrics ydb/library/yql/providers/dq/actors @@ -57,9 +56,7 @@ PEERDIR( ydb/library/yql/providers/solomon/gateway ydb/library/yql/providers/solomon/proto ydb/library/yql/providers/solomon/provider - ydb/library/yql/providers/ydb/actors - ydb/library/yql/providers/ydb/comp_nodes - ydb/library/yql/providers/ydb/provider + ydb/library/yql/utils/actor_log ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp index b3b4d8517c75..287df079ce51 100644 --- a/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp @@ -15,7 +15,8 @@ struct TActorFactory : public IActorFactory { ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) const override { + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) const override { auto actorPtr = NFq::NewTopicSession( topicPath, @@ -24,7 +25,8 @@ struct TActorFactory : public IActorFactory { partitionId, std::move(driver), credentialsProviderFactory, - counters + counters, 
+ pqGateway ); return NActors::TlsActivationContext->ExecutorThread.RegisterActor(actorPtr.release(), NActors::TMailboxType::HTSwap, Max()); } diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.h b/ydb/core/fq/libs/row_dispatcher/actors_factory.h index 6cc718b41cde..c222522310d0 100644 --- a/ydb/core/fq/libs/row_dispatcher/actors_factory.h +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace NFq::NRowDispatcher { @@ -17,7 +18,8 @@ struct IActorFactory : public TThrRefBase { ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) const = 0; + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) const = 0; }; IActorFactory::TPtr CreateActorFactory(); diff --git a/ydb/core/fq/libs/row_dispatcher/coordinator.cpp b/ydb/core/fq/libs/row_dispatcher/coordinator.cpp index dfc483ec939d..75358bcedd9d 100644 --- a/ydb/core/fq/libs/row_dispatcher/coordinator.cpp +++ b/ydb/core/fq/libs/row_dispatcher/coordinator.cpp @@ -168,7 +168,6 @@ void TActorCoordinator::AddRowDispatcher(NActors::TActorId actorId, bool isLocal void TActorCoordinator::Handle(NActors::TEvents::TEvPing::TPtr& ev) { LOG_ROW_DISPATCHER_TRACE("TEvPing received, " << ev->Sender); AddRowDispatcher(ev->Sender, false); - PrintInternalState(); LOG_ROW_DISPATCHER_TRACE("Send TEvPong to " << ev->Sender); Send(ev->Sender, new NActors::TEvents::TEvPong(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); } diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.cpp b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp index 8b7d46a690f2..04cf6771118b 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_filter.cpp +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -15,11 +16,24 @@ using TCallback = NFq::TJsonFilter::TCallback; const char* 
OffsetFieldName = "_offset"; TString LogPrefix = "JsonFilter: "; +NYT::TNode CreateTypeNode(const TString& fieldType) { + return NYT::TNode::CreateList() + .Add("DataType") + .Add(fieldType); +} + void AddField(NYT::TNode& node, const TString& fieldName, const TString& fieldType) { node.Add( NYT::TNode::CreateList() .Add(fieldName) - .Add(NYT::TNode::CreateList().Add("DataType").Add(fieldType)) + .Add(CreateTypeNode(fieldType)) + ); +} + +void AddOptionalField(NYT::TNode& node, const TString& fieldName, const TString& fieldType) { + node.Add(NYT::TNode::CreateList() + .Add(fieldName) + .Add(NYT::TNode::CreateList().Add("OptionalType").Add(CreateTypeNode(fieldType))) ); } @@ -27,7 +41,7 @@ NYT::TNode MakeInputSchema(const TVector& columns) { auto structMembers = NYT::TNode::CreateList(); AddField(structMembers, OffsetFieldName, "Uint64"); for (const auto& col : columns) { - AddField(structMembers, col, "String"); + AddOptionalField(structMembers, col, "String"); } return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); } @@ -53,7 +67,7 @@ class TFilterInputSpec : public NYql::NPureCalc::TInputSpecBase { TVector Schemas; }; -class TFilterInputConsumer : public NYql::NPureCalc::IConsumer>> { +class TFilterInputConsumer : public NYql::NPureCalc::IConsumer&, const TVector>&>> { public: TFilterInputConsumer( const TFilterInputSpec& spec, @@ -91,28 +105,33 @@ class TFilterInputConsumer : public NYql::NPureCalc::IConsumer> value) override { + void OnObject(std::pair&, const TVector>&> values) override { + Y_ENSURE(FieldsPositions.size() == values.second.size()); + NKikimr::NMiniKQL::TThrowingBindTerminator bind; - with_lock (Worker->GetScopedAlloc()) { auto& holderFactory = Worker->GetGraph().GetHolderFactory(); - NYql::NUdf::TUnboxedValue* items = nullptr; - NYql::NUdf::TUnboxedValue result = Cache.NewArray( - holderFactory, - static_cast(value.second.size() + 1), - items); - - items[OffsetPosition] = NYql::NUdf::TUnboxedValuePod(value.first); + 
// TODO: use blocks here + for (size_t rowId = 0; rowId < values.second.front().size(); ++rowId) { + NYql::NUdf::TUnboxedValue* items = nullptr; - Y_ENSURE(FieldsPositions.size() == value.second.size()); + NYql::NUdf::TUnboxedValue result = Cache.NewArray( + holderFactory, + static_cast(values.second.size() + 1), + items); - size_t i = 0; - for (const auto& v : value.second) { - NYql::NUdf::TStringValue str(v); - items[FieldsPositions[i++]] = NYql::NUdf::TUnboxedValuePod(std::move(str)); + items[OffsetPosition] = NYql::NUdf::TUnboxedValuePod(values.first[rowId]); + + size_t fieldId = 0; + for (const auto& column : values.second) { + items[FieldsPositions[fieldId++]] = column[rowId].data() // Check that std::string_view was initialized in json_parser + ? NKikimr::NMiniKQL::MakeString(column[rowId]).MakeOptional() + : NKikimr::NUdf::TUnboxedValuePod(); + } + + Worker->Push(std::move(result)); } - Worker->Push(std::move(result)); } } @@ -196,7 +215,7 @@ struct NYql::NPureCalc::TInputSpecTraits { static constexpr bool IsPartial = false; static constexpr bool SupportPushStreamMode = true; - using TConsumerType = THolder>>>; + using TConsumerType = THolder&, const TVector>&>>>; static TConsumerType MakeConsumer( const TFilterInputSpec& spec, @@ -238,8 +257,9 @@ class TJsonFilter::TImpl { LOG_ROW_DISPATCHER_DEBUG("Program created"); } - void Push(ui64 offset, const TList& value) { - InputConsumer->OnObject(std::make_pair(offset, value)); + void Push(const TVector& offsets, const TVector>& values) { + Y_ENSURE(values, "Expected non empty schema"); + InputConsumer->OnObject(std::make_pair(offsets, values)); } TString GetSql() const { @@ -253,7 +273,19 @@ class TJsonFilter::TImpl { Y_ABORT_UNLESS(columnNames.size() == columnTypes.size()); str << OffsetFieldName << ", "; for (size_t i = 0; i < columnNames.size(); ++i) { - str << "CAST(" << columnNames[i] << " as " << columnTypes[i] << ") as " << columnNames[i] << ((i != columnNames.size() - 1) ? 
"," : ""); + TString columnType = columnTypes[i]; + if (columnType == "Json") { + columnType = "String"; + } else if (columnType == "Optional") { + columnType = "Optional"; + } + + if (columnType.StartsWith("Optional")) { + str << "IF(" << columnNames[i] << " IS NOT NULL, Unwrap(CAST(" << columnNames[i] << " as " << columnType << ")), NULL)"; + } else { + str << "Unwrap(CAST(" << columnNames[i] << " as " << columnType << "))"; + } + str << " as " << columnNames[i] << ((i != columnNames.size() - 1) ? "," : ""); } str << " FROM Input;\n"; str << "$filtered = SELECT * FROM $fields " << whereFilter << ";\n"; @@ -266,7 +298,7 @@ class TJsonFilter::TImpl { private: THolder> Program; - THolder>>> InputConsumer; + THolder&, const TVector>&>>> InputConsumer; const TString Sql; }; @@ -280,9 +312,9 @@ TJsonFilter::TJsonFilter( TJsonFilter::~TJsonFilter() { } - -void TJsonFilter::Push(ui64 offset, const TList& value) { - Impl->Push(offset, value); + +void TJsonFilter::Push(const TVector& offsets, const TVector>& values) { + Impl->Push(offsets, values); } TString TJsonFilter::GetSql() { diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.h b/ydb/core/fq/libs/row_dispatcher/json_filter.h index f1694a277fbb..f3435763ce3e 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_filter.h +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.h @@ -1,23 +1,24 @@ - #pragma once -namespace NFq { - #include #include +namespace NFq { + class TJsonFilter { public: using TCallback = std::function; - + public: TJsonFilter( - const TVector& columns, + const TVector& columns, const TVector& types, const TString& whereFilter, TCallback callback); + ~TJsonFilter(); - void Push(ui64 offset, const TList& value); + + void Push(const TVector& offsets, const TVector>& values); TString GetSql(); private: diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.cpp b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp index 84ca3018b509..14c807fc0226 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_parser.cpp 
+++ b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp @@ -1,337 +1,287 @@ -#include +#include "json_parser.h" -#include -#include -#include -#include #include +#include + +#include namespace { -using TCallback = NFq::TJsonParser::TCallback; -using TInputConsumerArg = std::pair; -const char* OffsetFieldName = "_offset"; TString LogPrefix = "JsonParser: "; -void AddField(NYT::TNode& node, const TString& fieldName, const TString& fieldType) { - node.Add( - NYT::TNode::CreateList() - .Add(fieldName) - .Add(NYT::TNode::CreateList().Add("DataType").Add(fieldType)) - ); -} +struct TJsonParserBuffer { + size_t NumberValues = 0; + bool Finished = false; + TInstant CreationStartTime = TInstant::Now(); + TVector Offsets = {}; -NYT::TNode MakeInputSchema() { - auto structMembers = NYT::TNode::CreateList(); - AddField(structMembers, OffsetFieldName, "Uint64"); - AddField(structMembers, "data", "String"); - return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); -} - -NYT::TNode MakeOutputSchema(const TVector& columns) { - auto structMembers = NYT::TNode::CreateList(); - AddField(structMembers, OffsetFieldName, "Uint64"); - for (const auto& col : columns) { - AddField(structMembers, col, "String"); + bool IsReady() const { + return !Finished && NumberValues > 0; } - return NYT::TNode::CreateList().Add("StructType").Add(std::move(structMembers)); -} -class TParserInputConsumer : public NYql::NPureCalc::IConsumer { -public: - explicit TParserInputConsumer(NYql::NPureCalc::TWorkerHolder worker) - : Worker(std::move(worker)) { + size_t GetSize() const { + return Values.size(); } - ~TParserInputConsumer() override { - with_lock(Worker->GetScopedAlloc()) { - Cache.Clear(); - } + void Reserve(size_t size, size_t numberValues) { + Values.reserve(2 * (size + simdjson::SIMDJSON_PADDING)); + Offsets.reserve(numberValues); } - void OnObject(std::pair value) override { - NKikimr::NMiniKQL::TThrowingBindTerminator bind; - - with_lock (Worker->GetScopedAlloc()) { - 
auto& holderFactory = Worker->GetGraph().GetHolderFactory(); - NYql::NUdf::TUnboxedValue* items = nullptr; - - NYql::NUdf::TUnboxedValue result = Cache.NewArray( - holderFactory, - static_cast(2), - items); - - items[0] = NYql::NUdf::TUnboxedValuePod(value.first); - NYql::NUdf::TStringValue str(value.second.Size()); - std::memcpy(str.Data(), value.second.Data(), value.second.Size()); - items[1] = NYql::NUdf::TUnboxedValuePod(std::move(str)); - Worker->Push(std::move(result)); + void AddMessages(const TVector& messages) { + Y_ENSURE(!Finished, "Cannot add messages into finished buffer"); + + size_t messagesSize = 0; + for (const auto& message : messages) { + messagesSize += message.GetData().size(); } - } - void OnFinish() override { - NKikimr::NMiniKQL::TBindTerminator bind(Worker->GetGraph().GetTerminator()); - with_lock(Worker->GetScopedAlloc()) { - Worker->OnFinish(); + NumberValues += messages.size(); + Reserve(Values.size() + messagesSize, NumberValues); + for (const auto& message : messages) { + Values << message.GetData(); + Offsets.emplace_back(message.GetOffset()); } } -private: - NYql::NPureCalc::TWorkerHolder Worker; - NKikimr::NMiniKQL::TPlainContainerCache Cache; -}; - - -class TParserInputSpec : public NYql::NPureCalc::TInputSpecBase { -public: - TParserInputSpec() { - Schemas = {MakeInputSchema()}; + std::string_view AddHolder(std::string_view value) { + Y_ENSURE(Values.size() + value.size() <= Values.capacity(), "Requested too large holders"); + const size_t startPos = Values.size(); + Values << value; + return std::string_view(Values).substr(startPos, value.length()); } - const TVector& GetSchemas() const override { - return Schemas; + std::pair Finish() { + Y_ENSURE(!Finished, "Cannot finish buffer twice"); + Finished = true; + Values << TString(simdjson::SIMDJSON_PADDING, ' '); + Values.reserve(2 * Values.size()); + return {Values.data(), Values.size()}; } -private: - TVector Schemas; -}; - - -class TParserOutputConsumer: public 
NYql::NPureCalc::IConsumer>> { -public: - TParserOutputConsumer(TCallback callback) - : Callback(callback) { + void Clear() { + Y_ENSURE(Finished, "Cannot clear not finished buffer"); + NumberValues = 0; + Finished = false; + CreationStartTime = TInstant::Now(); + Values.clear(); + Offsets.clear(); } - void OnObject(std::pair> value) override { - Callback(value.first, std::move(value.second)); - } - - void OnFinish() override { - Y_UNREACHABLE(); - } private: - TCallback Callback; + TStringBuilder Values = {}; }; -class TParserOutputSpec: public NYql::NPureCalc::TOutputSpecBase { -public: - explicit TParserOutputSpec(const NYT::TNode& schema) - : Schema(schema) - {} - -public: - const NYT::TNode& GetSchema() const override { - return Schema; - } +} // anonymous namespace -private: - NYT::TNode Schema; -}; +namespace NFq { -struct TFieldsMapping{ - TVector FieldsPositions; - size_t OffsetPosition; +//// TJsonParser - TFieldsMapping(const NYT::TNode& schema, const NKikimr::NMiniKQL::TType* outputType) { - THashMap outputPositions; - Y_ENSURE(outputType->IsStruct()); - const auto structType = static_cast(outputType); - const auto count = structType->GetMembersCount(); +class TJsonParser::TImpl { + struct TColumnDescription { + std::string Name; + TString Type; + }; - for (ui32 i = 1; i < count; ++i) { // 0 index - OffsetFieldName - const auto name = structType->GetMemberName(i); - outputPositions[name] = i; +public: + TImpl(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) + : BatchSize(batchSize) + , BatchCreationTimeout(batchCreationTimeout) + , ParsedValues(columns.size()) + { + Y_ENSURE(columns.size() == types.size(), "Number of columns and types should by equal"); + LOG_ROW_DISPATCHER_INFO("Simdjson active implementation " << simdjson::get_active_implementation()->name()); + + Columns.reserve(columns.size()); + for (size_t i = 0; i < columns.size(); i++) { + Columns.emplace_back(TColumnDescription{ + .Name = columns[i], 
+ .Type = SkipOptional(types[i]) + }); } - const auto& fields = schema[1]; - Y_ENSURE(fields.IsList()); - Y_ENSURE(count == fields.Size()); - for (size_t i = 0; i < fields.Size(); ++i) { - auto name = fields[i][0].AsString(); - if (name == OffsetFieldName) { - OffsetPosition = i; - continue; - } - FieldsPositions.push_back(outputPositions[name]); + ColumnsIndex.reserve(columns.size()); + for (size_t i = 0; i < columns.size(); i++) { + ColumnsIndex.emplace(std::string_view(Columns[i].Name), i); } - } -}; -class TParserPushRelayImpl: public NYql::NPureCalc::IConsumer { -public: - TParserPushRelayImpl(const TParserOutputSpec& outputSpec, NYql::NPureCalc::IPushStreamWorker* worker, THolder>>> underlying) - : Underlying(std::move(underlying)) - , Worker(worker) - , FieldsMapping(outputSpec.GetSchema(), Worker->GetOutputType()) - { } - -public: - void OnObject(const NYql::NUdf::TUnboxedValue* value) override { - auto unguard = Unguard(Worker->GetScopedAlloc()); - TList result; - - Y_ENSURE(value->GetListLength() == FieldsMapping.FieldsPositions.size() + 1); - ui64 offset = value->GetElement(FieldsMapping.OffsetPosition).Get(); - - for (auto pos : FieldsMapping.FieldsPositions) { - const auto& cell = value->GetElement(pos); - - NYql::NUdf::TStringRef strRef(cell.AsStringRef()); - result.emplace_back(strRef.Data(), strRef.Size()); - } - - Underlying->OnObject(std::make_pair(offset, std::move(result))); + Buffer.Reserve(BatchSize, 1); + Parser.threaded = false; } - void OnFinish() override { - auto unguard = Unguard(Worker->GetScopedAlloc()); - Underlying->OnFinish(); + bool IsReady() const { + return Buffer.IsReady() && (Buffer.GetSize() >= BatchSize || TInstant::Now() - Buffer.CreationStartTime >= BatchCreationTimeout); } -private: - THolder>>> Underlying; - NYql::NPureCalc::IWorker* Worker; - TFieldsMapping FieldsMapping; -}; + TInstant GetCreationDeadline() const { + return Buffer.IsReady() ? 
Buffer.CreationStartTime + BatchCreationTimeout : TInstant::Zero(); + } -} + size_t GetNumberValues() const { + return Buffer.IsReady() ? Buffer.NumberValues : 0; + } -template <> -struct NYql::NPureCalc::TInputSpecTraits { - static constexpr bool IsPartial = false; - static constexpr bool SupportPushStreamMode = true; + const TVector& GetOffsets() { + return Buffer.Offsets; + } - using TConsumerType = THolder>; + void AddMessages(const TVector& messages) { + if (messages.empty()) { + return; + } - static TConsumerType MakeConsumer( - const TParserInputSpec& spec, - NYql::NPureCalc::TWorkerHolder worker - ) { - Y_UNUSED(spec); - return MakeHolder(std::move(worker)); + if (Buffer.Finished) { + Buffer.Clear(); + } + Buffer.AddMessages(messages); } -}; -template <> -struct NYql::NPureCalc::TOutputSpecTraits { - static const constexpr bool IsPartial = false; - static const constexpr bool SupportPushStreamMode = true; + const TVector>& Parse() { + Y_ENSURE(Buffer.IsReady(), "Nothing to parse"); - static void SetConsumerToWorker(const TParserOutputSpec& outputSpec, NYql::NPureCalc::IPushStreamWorker* worker, THolder>>> consumer) { - worker->SetConsumer(MakeHolder(outputSpec, worker, std::move(consumer))); - } -}; + const auto [values, size] = Buffer.Finish(); + LOG_ROW_DISPATCHER_TRACE("Parse values:\n" << values); -namespace NFq { + for (auto& parsedColumn : ParsedValues) { + parsedColumn.clear(); + parsedColumn.reserve(Buffer.NumberValues); + } -class TJsonParser::TImpl { -public: - TImpl( - const TVector& columns, - const TVector& types, - TCallback callback) - : Sql(GenerateSql(columns, types)) { - auto options = NYql::NPureCalc::TProgramFactoryOptions(); - auto factory = NYql::NPureCalc::MakeProgramFactory(options); - - LOG_ROW_DISPATCHER_DEBUG("Creating program..."); - Program = factory->MakePushStreamProgram( - TParserInputSpec(), - TParserOutputSpec(MakeOutputSchema(columns)), - Sql, - NYql::NPureCalc::ETranslationMode::SExpr - ); - 
LOG_ROW_DISPATCHER_DEBUG("Program created"); - InputConsumer = Program->Apply(MakeHolder(callback)); - LOG_ROW_DISPATCHER_DEBUG("InputConsumer created"); + size_t rowId = 0; + simdjson::ondemand::document_stream documents = Parser.iterate_many(values, size, simdjson::dom::DEFAULT_BATCH_SIZE); + for (auto document : documents) { + for (auto item : document.get_object()) { + const auto it = ColumnsIndex.find(item.escaped_key().value()); + if (it == ColumnsIndex.end()) { + continue; + } + + const auto& column = Columns[it->second]; + + std::string_view value; + if (item.value().is_null()) { + // TODO: support optional types and create UV + continue; + } else if (column.Type == "Json") { + value = item.value().raw_json().value(); + } else if (column.Type == "String" || column.Type == "Utf8") { + value = item.value().get_string().value(); + } else if (item.value().is_scalar()) { + // TODO: perform type validation and create UV + value = item.value().raw_json_token().value(); + } else { + throw yexception() << "Failed to parse json string, expected scalar type for column '" << it->first << "' with type " << column.Type << " but got nested json, please change column type to Json."; + } + + auto& parsedColumn = ParsedValues[it->second]; + parsedColumn.resize(rowId); + parsedColumn.emplace_back(CreateHolderIfNeeded(values, size, value)); + } + rowId++; + } + Y_ENSURE(rowId == Buffer.NumberValues, "Unexpected number of json documents"); + + for (auto& parsedColumn : ParsedValues) { + parsedColumn.resize(Buffer.NumberValues); + } + return ParsedValues; } - void Push( ui64 offset, const TString& value) { - LOG_ROW_DISPATCHER_TRACE("Push " << value); - InputConsumer->OnObject(std::make_pair(offset, value)); + TString GetDescription() const { + TStringBuilder description = TStringBuilder() << "Columns: "; + for (const auto& column : Columns) { + description << "'" << column.Name << "':" << column.Type << " "; + } + description << "\nNumber values in buffer: " << 
Buffer.NumberValues << ", buffer size: " << Buffer.GetSize() << ", finished: " << Buffer.Finished; + return description; } - TString GetSql() const { - return Sql; + TString GetDebugString(const TVector>& parsedValues) const { + TStringBuilder result; + for (size_t i = 0; i < Columns.size(); ++i) { + result << "Parsed column '" << Columns[i].Name << "': "; + for (const auto& value : parsedValues[i]) { + result << "'" << value << "' "; + } + result << "\n"; + } + return result; } private: - TString GenerateSql(const TVector& columnNames, const TVector& columnTypes) { - Y_ABORT_UNLESS(columnNames.size() == columnTypes.size(), "Unexpected column types size"); - - TStringStream udfOutputType; - TStringStream resultType; - for (size_t i = 0; i < columnNames.size(); ++i) { - const TString& lastSymbol = i + 1 == columnNames.size() ? "" : " "; - const TString& column = columnNames[i]; - const TString& type = SkipOptional(columnTypes[i]); - - udfOutputType << "'('" << column << " (DataType '" << type << "))" << lastSymbol; - resultType << "'('" << column << " (SafeCast (Member $parsed '" << column << ") $string_type))" << lastSymbol; + std::string_view CreateHolderIfNeeded(const char* dataHolder, size_t size, std::string_view value) { + ptrdiff_t diff = value.data() - dataHolder; + if (0 <= diff && static_cast(diff) < size) { + return value; } - - TStringStream str; - str << R"( - ( - (let $string_type (DataType 'String)) - - (let $input_type (TupleType $string_type (DataType 'Uint64))) - (let $output_type (TupleType (StructType )" << udfOutputType.Str() << R"() (DataType 'Uint64))) - (let $udf_argument_type (TupleType $input_type (StructType) $output_type)) - (let $udf_callable_type (CallableType '('1) '((StreamType $output_type)) '((StreamType $input_type)) '((OptionalType (DataType 'Utf8))))) - (let $udf (Udf 'ClickHouseClient.ParseFormat (Void) $udf_argument_type 'json_each_row $udf_callable_type (VoidType) '"" '())) - - (return (Map (Apply $udf (Map (Self '0) (lambda 
'($input) (block '( - (return '((Member $input 'data) (Member $input ')" << OffsetFieldName << R"())) - ))))) (lambda '($output) (block '( - (let $parsed (Nth $output '0)) - (return (AsStruct '(')" << OffsetFieldName << R"( (Nth $output '1)) )" << resultType.Str() << R"()) - ))))) - ) - )"; - LOG_ROW_DISPATCHER_DEBUG("GenerateSql " << str.Str()); - return str.Str(); + return Buffer.AddHolder(value); } - static TString SkipOptional(TStringBuf type) { + static TString SkipOptional(const TString& type) { if (type.StartsWith("Optional")) { - Y_ABORT_UNLESS(type.SkipPrefix("Optional<")); - Y_ABORT_UNLESS(type.ChopSuffix(">")); + TStringBuf optionalType = type; + Y_ENSURE(optionalType.SkipPrefix("Optional<"), "Unexpected type"); + Y_ENSURE(optionalType.ChopSuffix(">"), "Unexpected type"); + return TString(optionalType); } - return TString(type); + return type; } private: - THolder> Program; - THolder> InputConsumer; - const TString Sql; + const ui64 BatchSize; + const TDuration BatchCreationTimeout; + TVector Columns; + absl::flat_hash_map ColumnsIndex; + + TJsonParserBuffer Buffer; + simdjson::ondemand::parser Parser; + + TVector> ParsedValues; }; -TJsonParser::TJsonParser( - const TVector& columns, - const TVector& types, - TCallback callback) - : Impl(std::make_unique(columns, types, callback)) { -} +TJsonParser::TJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) + : Impl(std::make_unique(columns, types, batchSize, batchCreationTimeout)) +{} TJsonParser::~TJsonParser() { } - -void TJsonParser::Push(ui64 offset, const TString& value) { - Impl->Push(offset, value); + +void TJsonParser::AddMessages(const TVector& messages) { + Impl->AddMessages(messages); +} + +bool TJsonParser::IsReady() const { + return Impl->IsReady(); +} + +TInstant TJsonParser::GetCreationDeadline() const { + return Impl->GetCreationDeadline(); +} + +size_t TJsonParser::GetNumberValues() const { + return Impl->GetNumberValues(); +} + +const 
TVector& TJsonParser::GetOffsets() const { + return Impl->GetOffsets(); +} + +const TVector>& TJsonParser::Parse() { + return Impl->Parse(); +} + +TString TJsonParser::GetDescription() const { + return Impl->GetDescription(); } -TString TJsonParser::GetSql() { - return Impl->GetSql(); +TString TJsonParser::GetDebugString(const TVector>& parsedValues) const { + return Impl->GetDebugString(parsedValues); } -std::unique_ptr NewJsonParser( - const TVector& columns, - const TVector& types, - TCallback callback) { - return std::unique_ptr(new TJsonParser(columns, types, callback)); +std::unique_ptr NewJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) { + return std::unique_ptr(new TJsonParser(columns, types, batchSize, batchCreationTimeout)); } } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.h b/ydb/core/fq/libs/row_dispatcher/json_parser.h index cb5137105e6b..4f5f2b14e3a2 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_parser.h +++ b/ydb/core/fq/libs/row_dispatcher/json_parser.h @@ -1,32 +1,30 @@ #pragma once -#include - -#include +#include namespace NFq { class TJsonParser { public: - using TCallback = std::function&&)>; - -public: - TJsonParser( - const TVector& columns, - const TVector& types, - TCallback callback); + TJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout); ~TJsonParser(); - void Push(ui64 offset, const TString& value); - TString GetSql(); + + bool IsReady() const; + TInstant GetCreationDeadline() const; + size_t GetNumberValues() const; + const TVector& GetOffsets() const; + + void AddMessages(const TVector& messages); + const TVector>& Parse(); + + TString GetDescription() const; + TString GetDebugString(const TVector>& parsedValues) const; private: class TImpl; const std::unique_ptr Impl; }; -std::unique_ptr NewJsonParser( - const TVector& columns, - const TVector& types, - TJsonParser::TCallback callback); 
+std::unique_ptr NewJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout); } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/leader_election.cpp b/ydb/core/fq/libs/row_dispatcher/leader_election.cpp index 6817cfc292c0..5f945ddc9f38 100644 --- a/ydb/core/fq/libs/row_dispatcher/leader_election.cpp +++ b/ydb/core/fq/libs/row_dispatcher/leader_election.cpp @@ -222,6 +222,10 @@ void TLeaderElection::Bootstrap() { Become(&TLeaderElection::StateFunc); LogPrefix = "TLeaderElection " + SelfId().ToString() + " "; LOG_ROW_DISPATCHER_DEBUG("Successfully bootstrapped, local coordinator id " << CoordinatorId.ToString()); + if (Config.GetLocalMode()) { + TActivationContext::ActorSystem()->Send(ParentId, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(CoordinatorId)); + return; + } ProcessState(); } diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp index 3d327385cf0c..d9bf3fd9dacb 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp @@ -108,7 +108,6 @@ class TRowDispatcher : public TActorBootstrapped { NConfig::TRowDispatcherConfig Config; - NConfig::TCommonConfig CommonConfig; NKikimr::TYdbCredentialsProviderFactory CredentialsProviderFactory; TYqSharedResources::TPtr YqSharedResources; TMaybe CoordinatorActorId; @@ -120,6 +119,7 @@ class TRowDispatcher : public TActorBootstrapped { NFq::NRowDispatcher::IActorFactory::TPtr ActorFactory; const ::NMonitoring::TDynamicCounterPtr Counters; TRowDispatcherMetrics Metrics; + NYql::IPqGateway::TPtr PqGateway; struct ConsumerCounters { ui64 NewDataArrived = 0; @@ -171,13 +171,13 @@ class TRowDispatcher : public TActorBootstrapped { public: explicit TRowDispatcher( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const 
TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const TString& tenant, const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, - const ::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway); void Bootstrap(); @@ -234,15 +234,14 @@ class TRowDispatcher : public TActorBootstrapped { TRowDispatcher::TRowDispatcher( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const TString& tenant, const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) : Config(config) - , CommonConfig(commonConfig) , CredentialsProviderFactory(credentialsProviderFactory) , YqSharedResources(yqSharedResources) , CredentialsFactory(credentialsFactory) @@ -250,7 +249,8 @@ TRowDispatcher::TRowDispatcher( , Tenant(tenant) , ActorFactory(actorFactory) , Counters(counters) - , Metrics(counters) { + , Metrics(counters) + , PqGateway(pqGateway) { } void TRowDispatcher::Bootstrap() { @@ -303,8 +303,8 @@ void TRowDispatcher::Handle(TEvPrivate::TEvCoordinatorPing::TPtr&) { if (!CoordinatorActorId) { return; } - LOG_ROW_DISPATCHER_DEBUG("Send ping to " << *CoordinatorActorId); - Send(*CoordinatorActorId, new NActors::TEvents::TEvPing(), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); + LOG_ROW_DISPATCHER_TRACE("Send ping to " << *CoordinatorActorId); + Send(*CoordinatorActorId, new NActors::TEvents::TEvPing()); } void TRowDispatcher::Handle(NActors::TEvents::TEvPong::TPtr&) { @@ -391,7 +391,9 @@ void 
TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { CredentialsFactory, ev->Get()->Record.GetToken(), source.GetAddBearerToToken()), - Counters); + Counters, + PqGateway + ); SessionInfo& sessionInfo = topicSessionInfo.Sessions[sessionActorId]; sessionInfo.Consumers[ev->Sender] = consumerInfo; } else { @@ -586,23 +588,23 @@ void TRowDispatcher::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { std::unique_ptr NewRowDispatcher( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const TString& tenant, const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) { return std::unique_ptr(new TRowDispatcher( config, - commonConfig, credentialsProviderFactory, yqSharedResources, credentialsFactory, tenant, actorFactory, - counters)); + counters, + pqGateway)); } } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h index 54c3b1521afd..4ff46b2ba03f 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -16,12 +17,12 @@ namespace NFq { std::unique_ptr NewRowDispatcher( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const TString& tenant, const NFq::NRowDispatcher::IActorFactory::TPtr& actorFactory, - const 
::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway); } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp index 1300f419d7de..dd4352e66549 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.cpp @@ -11,22 +11,22 @@ using namespace NActors; std::unique_ptr NewRowDispatcherService( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const TString& tenant, - const ::NMonitoring::TDynamicCounterPtr& counters) + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) { return NewRowDispatcher( config, - commonConfig, credentialsProviderFactory, yqSharedResources, credentialsFactory, tenant, NFq::NRowDispatcher::CreateActorFactory(), - counters); + counters, + pqGateway); } } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h index ef8a9f29099d..1996526bd70a 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher_service.h @@ -6,6 +6,7 @@ #include #include +#include #include "events/data_plane.h" #include @@ -16,11 +17,11 @@ namespace NFq { std::unique_ptr NewRowDispatcherService( const NConfig::TRowDispatcherConfig& config, - const NConfig::TCommonConfig& commonConfig, const NKikimr::TYdbCredentialsProviderFactory& credentialsProviderFactory, const TYqSharedResources::TPtr& yqSharedResources, NYql::ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const 
TString& tenant, - const ::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway); } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp index a1dc73bfef63..1a4c02a1f191 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -10,6 +10,8 @@ #include #include #include + +#include #include #include @@ -50,10 +52,10 @@ struct TEvPrivate { EvPqEventsReady = EvBegin + 10, EvCreateSession, EvStatus, - EvDataParsed, EvDataAfterFilteration, EvDataFiltered, EvPrintState, + EvStartParsing, EvEnd }; static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); @@ -63,20 +65,13 @@ struct TEvPrivate { struct TEvCreateSession : public NActors::TEventLocal {}; struct TEvPrintState : public NActors::TEventLocal {}; struct TEvStatus : public NActors::TEventLocal {}; - struct TEvDataParsed : public NActors::TEventLocal { - TEvDataParsed(ui64 offset, TList&& value) - : Offset(offset) - , Value(std::move(value)) - {} - ui64 Offset = 0; - TList Value; - }; + struct TEvStartParsing : public NActors::TEventLocal {}; struct TEvDataFiltered : public NActors::TEventLocal { - TEvDataFiltered(ui64 offset) + explicit TEvDataFiltered(ui64 offset) : Offset(offset) {} - ui64 Offset = 0; + const ui64 Offset; }; struct TEvDataAfterFilteration : public NActors::TEventLocal { @@ -99,9 +94,8 @@ TVector GetVector(const google::protobuf::RepeatedPtrField& va } class TTopicSession : public TActorBootstrapped { - private: - using TParserInputType = std::pair< TVector, TVector>; // TODO: remove after YQ-3594 + using TParserInputType = TSet>; struct ClientsInfo { ClientsInfo(const NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) @@ -120,6 +114,7 @@ class TTopicSession : public TActorBootstrapped { bool 
DataArrivedSent = false; TMaybe NextMessageOffset; ui64 LastSendedNextMessageOffset = 0; + TVector FieldsIds; }; struct TTopicEventProcessor { @@ -136,26 +131,34 @@ class TTopicSession : public TActorBootstrapped { const TString& LogPrefix; }; + struct TParserSchema { + TVector FieldsMap; // index - FieldId (from FieldsIndexes), value - parsing schema offset + TParserInputType InputType; + }; + const TString TopicPath; NActors::TActorId RowDispatcherActorId; ui32 PartitionId; NYdb::TDriver Driver; std::shared_ptr CredentialsProviderFactory; - std::unique_ptr TopicClient; + NYql::ITopicClient::TPtr TopicClient; std::shared_ptr ReadSession; const i64 BufferSize; TString LogPrefix; NYql::NDq::TDqAsyncStats IngressStats; ui64 LastMessageOffset = 0; bool IsWaitingEvents = false; + bool IsStartParsingScheduled = false; THashMap Clients; THashSet ClientsWithoutPredicate; std::unique_ptr Parser; NConfig::TRowDispatcherConfig Config; ui64 UsedSize = 0; - TMaybe CurrentParserTypes; const ::NMonitoring::TDynamicCounterPtr Counters; TTopicSessionMetrics Metrics; + TParserSchema ParserSchema; + THashMap FieldsIndexes; + NYql::IPqGateway::TPtr PqGateway; public: explicit TTopicSession( @@ -165,7 +168,8 @@ class TTopicSession : public TActorBootstrapped { ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway); void Bootstrap(); void PassAway() override; @@ -174,14 +178,16 @@ class TTopicSession : public TActorBootstrapped { private: NYdb::NTopic::TTopicClientSettings GetTopicClientSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const; - NYdb::NTopic::TTopicClient& GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams); + NYql::ITopicClient& GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams); NYdb::NTopic::TReadSessionSettings 
GetReadSessionSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const; void CreateTopicSession(); void CloseTopicSession(); void SubscribeOnNextEvent(); - void SendToParsing(ui64 offset, const TString& message); + void SendToParsing(const TVector& messages); + void DoParsing(bool force = false); + void DoFiltering(const TVector& offsets, const TVector>& parsedValues); void SendData(ClientsInfo& info); - void InitParser(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams); + void UpdateParser(); void FatalError(const TString& message, const std::unique_ptr* filter = nullptr); void SendDataArrived(ClientsInfo& client); void StopReadSession(); @@ -194,7 +200,6 @@ class TTopicSession : public TActorBootstrapped { void Handle(NFq::TEvPrivate::TEvPqEventsReady::TPtr&); void Handle(NFq::TEvPrivate::TEvCreateSession::TPtr&); - void Handle(NFq::TEvPrivate::TEvDataParsed::TPtr&); void Handle(NFq::TEvPrivate::TEvDataAfterFilteration::TPtr&); void Handle(NFq::TEvPrivate::TEvStatus::TPtr&); void Handle(NFq::TEvPrivate::TEvDataFiltered::TPtr&); @@ -206,19 +211,22 @@ class TTopicSession : public TActorBootstrapped { void PrintInternalState(); void SendSessionError(NActors::TActorId readActorId, const TString& message); + TVector> RebuildJson(const ClientsInfo& info, const TVector>& parsedValues); + void UpdateParserSchema(const TParserInputType& inputType); + void UpdateFieldsIds(ClientsInfo& clientInfo); private: STRICT_STFUNC_EXC(StateFunc, hFunc(NFq::TEvPrivate::TEvPqEventsReady, Handle); hFunc(NFq::TEvPrivate::TEvCreateSession, Handle); - hFunc(NFq::TEvPrivate::TEvDataParsed, Handle); hFunc(NFq::TEvPrivate::TEvDataAfterFilteration, Handle); hFunc(NFq::TEvPrivate::TEvStatus, Handle); hFunc(NFq::TEvPrivate::TEvDataFiltered, Handle); hFunc(NFq::TEvPrivate::TEvPrintState, Handle); hFunc(TEvRowDispatcher::TEvGetNextBatch, Handle); hFunc(NFq::TEvRowDispatcher::TEvStartSession, Handle); + sFunc(NFq::TEvPrivate::TEvStartParsing, DoParsing); 
cFunc(NActors::TEvents::TEvPoisonPill::EventType, PassAway); hFunc(NFq::TEvRowDispatcher::TEvStopSession, Handle);, ExceptionFunc(std::exception, HandleException) @@ -228,7 +236,6 @@ class TTopicSession : public TActorBootstrapped { cFunc(NActors::TEvents::TEvPoisonPill::EventType, PassAway); IgnoreFunc(NFq::TEvPrivate::TEvPqEventsReady); IgnoreFunc(NFq::TEvPrivate::TEvCreateSession); - IgnoreFunc(NFq::TEvPrivate::TEvDataParsed); IgnoreFunc(NFq::TEvPrivate::TEvDataAfterFilteration); IgnoreFunc(NFq::TEvPrivate::TEvStatus); IgnoreFunc(NFq::TEvPrivate::TEvDataFiltered); @@ -246,7 +253,8 @@ TTopicSession::TTopicSession( ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway) : TopicPath(topicPath) , RowDispatcherActorId(rowDispatcherActorId) , PartitionId(partitionId) @@ -256,6 +264,7 @@ TTopicSession::TTopicSession( , LogPrefix("TopicSession") , Config(config) , Counters(counters) + , PqGateway(pqGateway) { } @@ -304,9 +313,9 @@ NYdb::NTopic::TTopicClientSettings TTopicSession::GetTopicClientSettings(const N return opts; } -NYdb::NTopic::TTopicClient& TTopicSession::GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) { +NYql::ITopicClient& TTopicSession::GetTopicClient(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) { if (!TopicClient) { - TopicClient = std::make_unique(Driver, GetTopicClientSettings(sourceParams)); + TopicClient = PqGateway->GetTopicClient(Driver, GetTopicClientSettings(sourceParams)); } return *TopicClient; } @@ -348,11 +357,11 @@ void TTopicSession::CreateTopicSession() { return; } - // Use any sourceParams. - const NYql::NPq::NProto::TDqPqTopicSource& sourceParams = Clients.begin()->second.Settings.GetSource(); - if (!ReadSession) { - InitParser(sourceParams); + UpdateParser(); + + // Use any sourceParams. 
+ const NYql::NPq::NProto::TDqPqTopicSource& sourceParams = Clients.begin()->second.Settings.GetSource(); ReadSession = GetTopicClient(sourceParams).CreateReadSession(GetReadSessionSettings(sourceParams)); SubscribeOnNextEvent(); } @@ -370,29 +379,21 @@ void TTopicSession::Handle(NFq::TEvPrivate::TEvCreateSession::TPtr&) { CreateTopicSession(); } -void TTopicSession::Handle(NFq::TEvPrivate::TEvDataParsed::TPtr& ev) { - LOG_ROW_DISPATCHER_TRACE("TEvDataParsed, offset " << ev->Get()->Offset); - - for (auto v: ev->Get()->Value) { - LOG_ROW_DISPATCHER_TRACE("v " << v); +TVector> TTopicSession::RebuildJson(const ClientsInfo& info, const TVector>& parsedValues) { + TVector> result; + const auto& offsets = ParserSchema.FieldsMap; + result.reserve(info.FieldsIds.size()); + for (auto fieldId : info.FieldsIds) { + Y_ENSURE(fieldId < offsets.size(), "fieldId " << fieldId << ", offsets.size() " << offsets.size()); + auto offset = offsets[fieldId]; + Y_ENSURE(offset < parsedValues.size(), "offset " << offset << ", jsonBatch.size() " << parsedValues.size()); + result.push_back(parsedValues[offset]); } - - for (auto& [actorId, info] : Clients) { - try { - if (!info.Filter) { - continue; - } - info.Filter->Push(ev->Get()->Offset, ev->Get()->Value); - } catch (const std::exception& e) { - FatalError(e.what(), &info.Filter); - } - } - auto event = std::make_unique(ev->Get()->Offset); - Send(SelfId(), event.release()); + return result; } void TTopicSession::Handle(NFq::TEvPrivate::TEvDataAfterFilteration::TPtr& ev) { - LOG_ROW_DISPATCHER_TRACE("TEvDataAfterFilteration, read actor id " << ev->Get()->ReadActorId.ToString()); + LOG_ROW_DISPATCHER_TRACE("TEvDataAfterFilteration, read actor id " << ev->Get()->ReadActorId.ToString() << ", " << ev->Get()->Json); auto it = Clients.find(ev->Get()->ReadActorId); if (it == Clients.end()) { LOG_ROW_DISPATCHER_ERROR("Skip DataAfterFilteration, wrong read actor, id " << ev->Get()->ReadActorId.ToString()); @@ -422,10 +423,9 @@ void 
TTopicSession::Handle(NFq::TEvPrivate::TEvStatus::TPtr&) { } void TTopicSession::Handle(NFq::TEvPrivate::TEvDataFiltered::TPtr& ev) { - LOG_ROW_DISPATCHER_TRACE("TEvDataFiltered, offset " << ev->Get()->Offset); + LOG_ROW_DISPATCHER_TRACE("TEvDataFiltered, last offset " << ev->Get()->Offset); for (auto& [actorId, info] : Clients) { - if (!info.NextMessageOffset - || *info.NextMessageOffset < ev->Get()->Offset + 1) { + if (!info.NextMessageOffset || *info.NextMessageOffset < ev->Get()->Offset + 1) { info.NextMessageOffset = ev->Get()->Offset + 1; } } @@ -457,7 +457,7 @@ void TTopicSession::HandleNewEvents() { break; } std::visit(TTopicEventProcessor{*this, LogPrefix}, *event); - } + } } void TTopicSession::CloseTopicSession() { @@ -472,14 +472,13 @@ void TTopicSession::CloseTopicSession() { void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent& event) { Self.Metrics.RowsRead->Add(event.GetMessages().size()); for (const auto& message : event.GetMessages()) { - const TString& data = message.GetData(); - Self.IngressStats.Bytes += data.size(); LOG_ROW_DISPATCHER_TRACE("Data received: " << message.DebugString(true)); - TString item = message.GetData(); - Self.SendToParsing(message.GetOffset(), item); + Self.IngressStats.Bytes += message.GetData().size(); Self.LastMessageOffset = message.GetOffset(); } + + Self.SendToParsing(event.GetMessages()); } void TTopicSession::TTopicEventProcessor::operator()(NYdb::NTopic::TSessionClosedEvent& ev) { @@ -528,16 +527,18 @@ TString TTopicSession::GetSessionId() const { return ReadSession ? 
ReadSession->GetSessionId() : TString{"empty"}; } -void TTopicSession::SendToParsing(ui64 offset, const TString& message) { - LOG_ROW_DISPATCHER_TRACE("SendToParsing, message " << message); - - for (auto& readActorId : ClientsWithoutPredicate) { - auto it = Clients.find(readActorId); +void TTopicSession::SendToParsing(const TVector& messages) { + for (const auto& readActorId : ClientsWithoutPredicate) { + const auto it = Clients.find(readActorId); Y_ENSURE(it != Clients.end(), "Internal error: unknown client"); auto& info = it->second; - if (!info.Filter) { - LOG_ROW_DISPATCHER_TRACE("Send message to client without parsing/filtering"); - AddDataToClient(info, offset, message); + if (info.Filter) { + continue; + } + + for (const auto& message : messages) { + LOG_ROW_DISPATCHER_TRACE("Send message with offset " << message.GetOffset() << " to client " << info.ReadActorId <<" without parsing/filtering"); + AddDataToClient(info, message.GetOffset(), message.GetData()); } } @@ -545,13 +546,53 @@ void TTopicSession::SendToParsing(ui64 offset, const TString& message) { return; } + Parser->AddMessages(messages); + DoParsing(); +} + +void TTopicSession::DoParsing(bool force) { + if (!Parser->IsReady() && !force) { + const TInstant batchCreationDeadline = Parser->GetCreationDeadline(); + LOG_ROW_DISPATCHER_TRACE("Collecting data to parse, skip parsing, creation deadline " << batchCreationDeadline); + if (!IsStartParsingScheduled && batchCreationDeadline) { + IsStartParsingScheduled = true; + Schedule(batchCreationDeadline, new TEvPrivate::TEvStartParsing()); + } + return; + } + + if (!Parser->GetNumberValues()) { + return; + } + + IsStartParsingScheduled = false; + LOG_ROW_DISPATCHER_TRACE("SendToParsing, first offset: " << Parser->GetOffsets().front() << ", number values in buffer " << Parser->GetOffsets().size()); + try { - Parser->Push(offset, message); + const auto& parsedValues = Parser->Parse(); + DoFiltering(Parser->GetOffsets(), parsedValues); } catch (const 
std::exception& e) { FatalError(e.what()); } } +void TTopicSession::DoFiltering(const TVector& offsets, const TVector>& parsedValues) { + Y_ENSURE(parsedValues, "Expected non empty schema"); + LOG_ROW_DISPATCHER_TRACE("SendToFiltering, first offset: " << offsets.front() << ", last offset: " << offsets.back() << ", data:\n" << Parser->GetDebugString(parsedValues)); + + for (auto& [actorId, info] : Clients) { + try { + if (info.Filter) { + info.Filter->Push(offsets, RebuildJson(info, parsedValues)); + } + } catch (const std::exception& e) { + FatalError(e.what(), &info.Filter); + } + } + + Send(SelfId(), new TEvPrivate::TEvDataFiltered(offsets.back())); +} + void TTopicSession::SendData(ClientsInfo& info) { info.DataArrivedSent = false; if (info.Buffer.empty()) { @@ -590,6 +631,19 @@ void TTopicSession::SendData(ClientsInfo& info) { info.LastSendedNextMessageOffset = *info.NextMessageOffset; } +void TTopicSession::UpdateFieldsIds(ClientsInfo& info) { + for (auto name : info.Settings.GetSource().GetColumns()) { + auto it = FieldsIndexes.find(name); + if (it == FieldsIndexes.end()) { + auto nextIndex = FieldsIndexes.size(); + info.FieldsIds.push_back(nextIndex); + FieldsIndexes[name] = nextIndex; + } else { + info.FieldsIds.push_back(it->second); + } + } +} + void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { auto it = Clients.find(ev->Sender); if (it != Clients.end()) { @@ -597,21 +651,23 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { return; } - LOG_ROW_DISPATCHER_INFO("New client, read actor id " << ev->Sender.ToString()); + LOG_ROW_DISPATCHER_INFO("New client: read actor id " << ev->Sender.ToString() << ", predicate: " + << ev->Get()->Record.GetSource().GetPredicate() << ", offset: " << ev->Get()->Record.GetOffset()); auto columns = GetVector(ev->Get()->Record.GetSource().GetColumns()); auto types = GetVector(ev->Get()->Record.GetSource().GetColumnTypes()); - auto parserType = std::make_pair(columns, 
types); - if (CurrentParserTypes && *CurrentParserTypes != parserType) { - SendSessionError(ev->Sender, "Different columns/types, use same in all queries"); - return; - } try { + if (Parser) { + // Parse remains data before adding new client + DoParsing(true); + } + auto& clientInfo = Clients.emplace( std::piecewise_construct, std::forward_as_tuple(ev->Sender), std::forward_as_tuple(ev)).first->second; + UpdateFieldsIds(clientInfo); TString predicate = clientInfo.Settings.GetSource().GetPredicate(); if (!predicate.empty()) { @@ -626,11 +682,9 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { ClientsWithoutPredicate.insert(ev->Sender); } - LOG_ROW_DISPATCHER_INFO("New client: offset " << clientInfo.NextMessageOffset << ", predicate: " << clientInfo.Settings.GetSource().GetPredicate()); - if (ReadSession) { if (clientInfo.Settings.HasOffset() && (clientInfo.Settings.GetOffset() <= LastMessageOffset)) { - LOG_ROW_DISPATCHER_INFO("New client has less offset than the last message, stop (restart) topic session"); + LOG_ROW_DISPATCHER_INFO("New client has less offset (" << clientInfo.Settings.GetOffset() << ") than the last message (" << LastMessageOffset << "), stop (restart) topic session"); StopReadSession(); } } @@ -641,7 +695,7 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { } catch (...) 
{ FatalError("Adding new client failed, " + CurrentExceptionMessage()); } - + UpdateParser(); PrintInternalState(); if (!ReadSession) { Schedule(TDuration::Seconds(Config.GetTimeoutBeforeStartSessionSec()), new NFq::TEvPrivate::TEvCreateSession()); @@ -665,26 +719,70 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev) { auto it = Clients.find(ev->Sender); if (it == Clients.end()) { - LOG_ROW_DISPATCHER_DEBUG("Wrong ClientSettings"); // TODO + LOG_ROW_DISPATCHER_DEBUG("Wrong ClientSettings"); return; } Clients.erase(it); ClientsWithoutPredicate.erase(ev->Sender); + if (Clients.empty()) { + StopReadSession(); + } + UpdateParser(); } -void TTopicSession::InitParser(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) { - if (Parser) { +void CollectColumns(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams, TSet>& columns) { + auto size = sourceParams.GetColumns().size(); + Y_ENSURE(size == sourceParams.GetColumnTypes().size()); + + for (int i = 0; i < size; ++i) { + auto name = sourceParams.GetColumns().Get(i); + auto type = sourceParams.GetColumnTypes().Get(i); + columns.emplace(name, type); + } +} + +void TTopicSession::UpdateParserSchema(const TParserInputType& inputType) { + ParserSchema.FieldsMap.clear(); + ParserSchema.FieldsMap.resize(FieldsIndexes.size()); + ui64 offset = 0; + for (const auto& [name, type]: inputType) { + Y_ENSURE(FieldsIndexes.contains(name)); + ui64 index = FieldsIndexes[name]; + ParserSchema.FieldsMap[index] = offset++; + } + ParserSchema.InputType = inputType; +} + +void TTopicSession::UpdateParser() { + TSet> namesWithTypes; + for (auto& [readActorId, info] : Clients) { + CollectColumns(info.Settings.GetSource(), namesWithTypes); + } + + if (namesWithTypes == ParserSchema.InputType) { return; } + if (namesWithTypes.empty()) { + LOG_ROW_DISPATCHER_INFO("No columns to parse, reset parser"); + Parser.reset(); + return; + } + try { - CurrentParserTypes = std::make_pair(GetVector(sourceParams.GetColumns()), 
GetVector(sourceParams.GetColumnTypes())); - NActors::TActorSystem* actorSystem = NActors::TActivationContext::ActorSystem(); - Parser = NewJsonParser( - GetVector(sourceParams.GetColumns()), - GetVector(sourceParams.GetColumnTypes()), - [actorSystem, selfId = SelfId()](ui64 offset, TList&& value){ - actorSystem->Send(selfId, new NFq::TEvPrivate::TEvDataParsed(offset, std::move(value))); - }); + UpdateParserSchema(namesWithTypes); + + TVector names; + TVector types; + names.reserve(namesWithTypes.size()); + types.reserve(namesWithTypes.size()); + for (const auto& [name, type] : namesWithTypes) { + names.push_back(name); + types.push_back(type); + } + + LOG_ROW_DISPATCHER_TRACE("Init JsonParser with columns: " << JoinSeq(',', names)); + const auto& parserConfig = Config.GetJsonParser(); + Parser = NewJsonParser(names, types, parserConfig.GetBatchSizeBytes(), TDuration::MilliSeconds(parserConfig.GetBatchCreationTimeoutMs())); } catch (const NYql::NPureCalc::TCompileError& e) { FatalError(e.GetIssues()); } @@ -694,10 +792,10 @@ void TTopicSession::FatalError(const TString& message, const std::unique_ptrGetSql(); + str << ", parser description:\n" << Parser->GetDescription(); } if (filter) { - str << ", filter sql:" << (*filter)->GetSql(); + str << ", filter sql:\n" << (*filter)->GetSql(); } LOG_ROW_DISPATCHER_ERROR("FatalError: " << str.Str()); @@ -724,7 +822,7 @@ void TTopicSession::StopReadSession() { ReadSession->Close(TDuration::Zero()); ReadSession.reset(); } - TopicClient.reset(); + TopicClient.Reset(); } void TTopicSession::SendDataArrived(ClientsInfo& info) { @@ -775,8 +873,9 @@ std::unique_ptr NewTopicSession( ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters) { - return std::unique_ptr(new TTopicSession(topicPath, config, rowDispatcherActorId, partitionId, std::move(driver), credentialsProviderFactory, counters)); + const ::NMonitoring::TDynamicCounterPtr& counters, + const 
NYql::IPqGateway::TPtr& pqGateway) { + return std::unique_ptr(new TTopicSession(topicPath, config, rowDispatcherActorId, partitionId, std::move(driver), credentialsProviderFactory, counters, pqGateway)); } } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.h b/ydb/core/fq/libs/row_dispatcher/topic_session.h index b3980cce8269..17ca62dda546 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.h +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.h @@ -5,7 +5,10 @@ #include #include + #include +#include + #include #include @@ -19,6 +22,7 @@ std::unique_ptr NewTopicSession( ui32 partitionId, NYdb::TDriver driver, std::shared_ptr credentialsProviderFactory, - const ::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + const NYql::IPqGateway::TPtr& pqGateway); } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp index 1645f521051d..7682485b4644 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -23,6 +25,8 @@ class TFixture : public NUnitTest::TBaseFixture { TAutoPtr app = new TAppPrepare(); Runtime.Initialize(app->Unwrap()); Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); + + NKikimr::EnableYDBBacktraceFormat(); } void TearDown(NUnitTest::TTestContext& /* context */) override { @@ -56,8 +60,8 @@ Y_UNIT_TEST_SUITE(TJsonFilterTests) { [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push(5, {"hello1", "99"}); - Filter->Push(6, {"hello2", "101"}); + Filter->Push({5}, {{"hello1"}, {"99"}}); + Filter->Push({6}, {{"hello2"}, {"101"}}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); } @@ -71,21 +75,51 @@ Y_UNIT_TEST_SUITE(TJsonFilterTests) { [&](ui64 offset, const 
TString& json) { result[offset] = json; }); - Filter->Push(5, {"99", "hello1"}); - Filter->Push(6, {"101", "hello2"}); + Filter->Push({5}, {{"99"}, {"hello1"}}); + Filter->Push({6}, {{"101"}, {"hello2"}}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({7}, {{"102"}, {std::string_view()}}), yexception, "Failed to unwrap empty optional"); + UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({8}, {{"str"}, {"hello3"}}), yexception, "Failed to unwrap empty optional"); } - Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { + Y_UNIT_TEST_F(ManyValues, TFixture) { + TMap result; + MakeFilter( + {"a1", "a2"}, + {"String", "UInt64"}, + "where a2 > 100", + [&](ui64 offset, const TString& json) { + result[offset] = json; + }); + Filter->Push({5, 6}, {{"hello1", "hello2"}, {"99", "101"}}); + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + } + + Y_UNIT_TEST_F(NullValues, TFixture) { + TMap result; + MakeFilter( + {"a1", "a2"}, + {"Optional", "String"}, + "where a1 is null", + [&](ui64 offset, const TString& json) { + result[offset] = json; + }); + Filter->Push({5}, {{std::string_view()}, {"str"}}); + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(R"({"a1":null,"a2":"str"})", result[5]); + UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({5}, {{"hello1"}, {"str"}}), yexception, "Failed to unwrap empty optional"); + } + + Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { MakeFilter( {"a1", "a2"}, {"String", "UInt64"}, "where Unwrap(a2) = 1", [&](ui64, const TString&) { }); - UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push(5, {"99", "hello1"}), yexception, "Failed to unwrap empty optional"); - } + UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({5}, {{"99"}, {"hello1"}}), yexception, "Failed to unwrap empty optional"); + } } } - diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp 
b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp index a9c389d3900f..28242a1ebc74 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp @@ -7,10 +7,10 @@ #include #include -#include - #include +#include + namespace { using namespace NKikimr; @@ -24,8 +24,9 @@ class TFixture : public NUnitTest::TBaseFixture { void SetUp(NUnitTest::TTestContext&) override { TAutoPtr app = new TAppPrepare(); + Runtime.SetLogBackend(CreateStderrBackend()); + Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_TRACE); Runtime.Initialize(app->Unwrap()); - Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); } void TearDown(NUnitTest::TTestContext& /* context */) override { @@ -34,89 +35,165 @@ class TFixture : public NUnitTest::TBaseFixture { } } - void MakeParser(TVector columns, TVector types, NFq::TJsonParser::TCallback callback) { - try { - Parser = NFq::NewJsonParser( - columns, - types, - callback); - } catch (NYql::NPureCalc::TCompileError compileError) { - UNIT_ASSERT_C(false, TStringBuilder() << "Failed to create json parser: " << compileError.what() << "\nQuery text:\n" << compileError.GetYql() << "Reason:\n" << compileError.GetIssues()); + void MakeParser(TVector columns, TVector types) { + Parser = NFq::NewJsonParser(columns, types, 0, TDuration::Zero()); + } + + void MakeParser(TVector columns) { + MakeParser(columns, TVector(columns.size(), "String")); + } + + void PushToParser(ui64 offset, const TString& data) { + Parser->AddMessages({GetMessage(offset, data)}); + + ParsedValues = Parser->Parse(); + ResultNumberValues = ParsedValues ? 
ParsedValues.front().size() : 0; + } + + TVector GetParsedRow(size_t id) const { + TVector result; + result.reserve(ParsedValues.size()); + for (const auto& columnResult : ParsedValues) { + result.emplace_back(columnResult[id]); } + return result; } - void MakeParser(TVector columns, NFq::TJsonParser::TCallback callback) { - MakeParser(columns, TVector(columns.size(), "String"), callback); + static NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage GetMessage(ui64 offset, const TString& data) { + NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessageInformation info(offset, "", 0, TInstant::Zero(), TInstant::Zero(), nullptr, nullptr, 0, ""); + return NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage(data, nullptr, info, nullptr); } TActorSystemStub actorSystemStub; NActors::TTestActorRuntime Runtime; std::unique_ptr Parser; + + ui64 ResultNumberValues = 0; + TVector> ParsedValues; }; Y_UNIT_TEST_SUITE(TJsonParserTests) { - Y_UNIT_TEST_F(Simple1, TFixture) { - TList result; - ui64 resultOffset; - MakeParser({"a1", "a2"}, {"String", "Optional"}, [&](ui64 offset, TList&& value){ - resultOffset = offset; - result = std::move(value); - }); - Parser->Push(5, R"({"a1": "hello1", "a2": 101, "event": "event1"})"); - UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + Y_UNIT_TEST_F(Simple1, TFixture) { + MakeParser({"a1", "a2"}, {"String", "Optional"}); + PushToParser(42,R"({"a1": "hello1", "a2": 101, "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); + + const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); UNIT_ASSERT_VALUES_EQUAL("101", result.back()); } - Y_UNIT_TEST_F(Simple2, TFixture) { - TList result; - ui64 resultOffset; - MakeParser({"a2", "a1"}, [&](ui64 offset, TList&& value){ - resultOffset = offset; - result = std::move(value); - }); - Parser->Push(5, R"({"a1": "hello1", "a2": "101", "event": "event1"})"); - UNIT_ASSERT_VALUES_EQUAL(5, 
resultOffset); + Y_UNIT_TEST_F(Simple2, TFixture) { + MakeParser({"a2", "a1"}); + PushToParser(42,R"({"a1": "hello1", "a2": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); + + const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); UNIT_ASSERT_VALUES_EQUAL("101", result.front()); UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); } - Y_UNIT_TEST_F(Simple3, TFixture) { - TList result; - ui64 resultOffset; - MakeParser({"a1", "a2"}, [&](ui64 offset, TList&& value){ - resultOffset = offset; - result = std::move(value); - }); - Parser->Push(5, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); - UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + Y_UNIT_TEST_F(Simple3, TFixture) { + MakeParser({"a1", "a2"}); + PushToParser(42,R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); + + const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); UNIT_ASSERT_VALUES_EQUAL("101", result.front()); UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); } - Y_UNIT_TEST_F(Simple4, TFixture) { - TList result; - ui64 resultOffset; - MakeParser({"a2", "a1"}, [&](ui64 offset, TList&& value){ - resultOffset = offset; - result = std::move(value); - }); - Parser->Push(5, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); - UNIT_ASSERT_VALUES_EQUAL(5, resultOffset); + Y_UNIT_TEST_F(Simple4, TFixture) { + MakeParser({"a2", "a1"}); + PushToParser(42, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); + + const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); UNIT_ASSERT_VALUES_EQUAL("101", result.back()); } - Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { + Y_UNIT_TEST_F(ManyValues, TFixture) { + MakeParser({"a1", "a2"}); + + Parser->AddMessages({ + GetMessage(42, R"({"a1": "hello1", "a2": "101", "event": "event1"})"), + 
GetMessage(43, R"({"a1": "hello1", "a2": "101", "event": "event2"})"), + GetMessage(44, R"({"a2": "101", "a1": "hello1", "event": "event3"})") + }); + + ParsedValues = Parser->Parse(); + ResultNumberValues = ParsedValues.front().size(); + UNIT_ASSERT_VALUES_EQUAL(3, ResultNumberValues); + for (size_t i = 0; i < ResultNumberValues; ++i) { + const auto& result = GetParsedRow(i); + UNIT_ASSERT_VALUES_EQUAL_C(2, result.size(), i); + UNIT_ASSERT_VALUES_EQUAL_C("hello1", result.front(), i); + UNIT_ASSERT_VALUES_EQUAL_C("101", result.back(), i); + } + } + + Y_UNIT_TEST_F(MissingFields, TFixture) { + MakeParser({"a1", "a2"}); + + Parser->AddMessages({ + GetMessage(42, R"({"a1": "hello1", "a2": "101", "event": "event1"})"), + GetMessage(43, R"({"a1": "hello1", "event": "event2"})"), + GetMessage(44, R"({"a2": "101", "a1": null, "event": "event3"})") + }); + + ParsedValues = Parser->Parse(); + ResultNumberValues = ParsedValues.front().size(); + UNIT_ASSERT_VALUES_EQUAL(3, ResultNumberValues); + for (size_t i = 0; i < ResultNumberValues; ++i) { + const auto& result = GetParsedRow(i); + UNIT_ASSERT_VALUES_EQUAL_C(2, result.size(), i); + UNIT_ASSERT_VALUES_EQUAL_C(i != 2 ? "hello1" : "", result.front(), i); + UNIT_ASSERT_VALUES_EQUAL_C(i != 1 ? 
"101" : "", result.back(), i); + } + } + + Y_UNIT_TEST_F(NestedTypes, TFixture) { + MakeParser({"nested", "a1"}, {"Optional", "String"}); + + Parser->AddMessages({ + GetMessage(42, R"({"a1": "hello1", "nested": {"key": "value"}})"), + GetMessage(43, R"({"a1": "hello1", "nested": ["key1", "key2"]})") + }); + + ParsedValues = Parser->Parse(); + ResultNumberValues = ParsedValues.front().size(); + UNIT_ASSERT_VALUES_EQUAL(2, ResultNumberValues); + + const auto& nestedJson = GetParsedRow(0); + UNIT_ASSERT_VALUES_EQUAL(2, nestedJson.size()); + UNIT_ASSERT_VALUES_EQUAL("{\"key\": \"value\"}", nestedJson.front()); + UNIT_ASSERT_VALUES_EQUAL("hello1", nestedJson.back()); - MakeParser({"a2", "a1"}, [&](ui64, TList&&){ }); - UNIT_ASSERT_EXCEPTION_CONTAINS(Parser->Push(5, R"(ydb)"), yexception, "DB::ParsingException: Cannot parse input: expected '{' before: 'ydb': (at row 1)"); + const auto& nestedList = GetParsedRow(1); + UNIT_ASSERT_VALUES_EQUAL(2, nestedList.size()); + UNIT_ASSERT_VALUES_EQUAL("[\"key1\", \"key2\"]", nestedList.front()); + UNIT_ASSERT_VALUES_EQUAL("hello1", nestedList.back()); + } + + Y_UNIT_TEST_F(StringTypeValidation, TFixture) { + MakeParser({"a1"}, {"String"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": 1234})"), simdjson::simdjson_error, "INCORRECT_TYPE: The JSON element does not have the requested type."); } -} + Y_UNIT_TEST_F(JsonTypeValidation, TFixture) { + MakeParser({"a1"}, {"Int32"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": {"key": "value"}})"), yexception, "Failed to parse json string, expected scalar type for column 'a1' with type Int32 but got nested json, please change column type to Json."); + } + + Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { + MakeParser({"a2", "a1"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"(ydb)"), simdjson::simdjson_error, "INCORRECT_TYPE: The JSON element does not have the requested type."); + } } +} diff --git 
a/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp index 93ccaa8c151e..bdef4408327e 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/leader_election_ut.cpp @@ -23,15 +23,18 @@ class TFixture : public NUnitTest::TBaseFixture { Runtime.Initialize(app->Unwrap()); Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_DEBUG); auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; - auto yqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); + YqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); RowDispatcher = Runtime.AllocateEdgeActor(); Coordinator1 = Runtime.AllocateEdgeActor(); Coordinator2 = Runtime.AllocateEdgeActor(); Coordinator3 = Runtime.AllocateEdgeActor(); + } + void Init(bool localMode = false) { NConfig::TRowDispatcherCoordinatorConfig config; config.SetCoordinationNodePath("row_dispatcher"); + config.SetLocalMode(localMode); auto& database = *config.MutableDatabase(); database.SetEndpoint(GetEnv("YDB_ENDPOINT")); database.SetDatabase(GetEnv("YDB_DATABASE")); @@ -42,7 +45,7 @@ class TFixture : public NUnitTest::TBaseFixture { Coordinator1, config, NKikimr::CreateYdbCredentialsProviderFactory, - yqSharedResources, + YqSharedResources, "/tenant", MakeIntrusive() ).release()); @@ -52,7 +55,7 @@ class TFixture : public NUnitTest::TBaseFixture { Coordinator2, config, NKikimr::CreateYdbCredentialsProviderFactory, - yqSharedResources, + YqSharedResources, "/tenant", MakeIntrusive() ).release()); @@ -62,7 +65,7 @@ class TFixture : public NUnitTest::TBaseFixture { Coordinator3, config, NKikimr::CreateYdbCredentialsProviderFactory, - yqSharedResources, + YqSharedResources, "/tenant", MakeIntrusive() ).release()); @@ -95,10 +98,12 @@ class TFixture : public NUnitTest::TBaseFixture { 
NActors::TActorId Coordinator2; NActors::TActorId Coordinator3; NActors::TActorId LeaderDetector; + TYqSharedResources::TPtr YqSharedResources; }; Y_UNIT_TEST_SUITE(LeaderElectionTests) { Y_UNIT_TEST_F(Test1, TFixture) { + Init(); auto coordinatorId1 = ExpectCoordinatorChanged(); auto coordinatorId2 = ExpectCoordinatorChanged(); @@ -134,7 +139,15 @@ Y_UNIT_TEST_SUITE(LeaderElectionTests) { auto coordinatorId6 = ExpectCoordinatorChanged(); UNIT_ASSERT(coordinatorId6 != coordinatorId4); } -} + Y_UNIT_TEST_F(TestLocalMode, TFixture) { + Init(true); + auto coordinatorId1 = ExpectCoordinatorChanged(); + auto coordinatorId2 = ExpectCoordinatorChanged(); + auto coordinatorId3 = ExpectCoordinatorChanged(); + TSet set {coordinatorId1, coordinatorId2, coordinatorId3}; + UNIT_ASSERT(set.size() == 3); + } } +} diff --git a/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp index f5641e815539..71eb34c58716 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace { @@ -32,7 +33,8 @@ struct TTestActorFactory : public NFq::NRowDispatcher::IActorFactory { ui32 /*partitionId*/, NYdb::TDriver /*driver*/, std::shared_ptr /*credentialsProviderFactory*/, - const ::NMonitoring::TDynamicCounterPtr& /*counters*/) const override { + const ::NMonitoring::TDynamicCounterPtr& /*counters*/, + const NYql::IPqGateway::TPtr& /*pqGateway*/) const override { auto actorId = Runtime.AllocateEdgeActor(); ActorIds.push(actorId); return actorId; @@ -61,7 +63,6 @@ class TFixture : public NUnitTest::TBaseFixture { database.SetDatabase("YDB_DATABASE"); database.SetToken(""); - NConfig::TCommonConfig commonConfig; auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; auto yqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); @@ 
-71,16 +72,23 @@ class TFixture : public NUnitTest::TBaseFixture { ReadActorId1 = Runtime.AllocateEdgeActor(); ReadActorId2 = Runtime.AllocateEdgeActor(); TestActorFactory = MakeIntrusive(Runtime); + + NYql::TPqGatewayServices pqServices( + yqSharedResources->UserSpaceYdbDriver, + nullptr, + nullptr, + std::make_shared(), + nullptr); RowDispatcher = Runtime.Register(NewRowDispatcher( config, - commonConfig, NKikimr::CreateYdbCredentialsProviderFactory, yqSharedResources, credentialsFactory, "Tenant", TestActorFactory, - MakeIntrusive() + MakeIntrusive(), + CreatePqNativeGateway(pqServices) ).release()); Runtime.EnableScheduleForActor(RowDispatcher); diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp index 65c24fcb85f1..0c94604792c8 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -10,6 +12,8 @@ #include #include +#include + namespace { using namespace NKikimr; @@ -31,6 +35,8 @@ class TFixture : public NUnitTest::TBaseFixture { Runtime.SetLogPriority(NKikimrServices::FQ_ROW_DISPATCHER, NLog::PRI_TRACE); Runtime.SetDispatchTimeout(TDuration::Seconds(5)); + NKikimr::EnableYDBBacktraceFormat(); + ReadActorId1 = Runtime.AllocateEdgeActor(); ReadActorId2 = Runtime.AllocateEdgeActor(); RowDispatcherActorId = Runtime.AllocateEdgeActor(); @@ -42,6 +48,16 @@ class TFixture : public NUnitTest::TBaseFixture { Config.SetSendStatusPeriodSec(2); Config.SetWithoutConsumer(true); + auto credFactory = NKikimr::CreateYdbCredentialsProviderFactory; + auto yqSharedResources = NFq::TYqSharedResources::Cast(NFq::CreateYqSharedResourcesImpl({}, credFactory, MakeIntrusive())); + + NYql::TPqGatewayServices pqServices( + yqSharedResources->UserSpaceYdbDriver, + nullptr, + nullptr, + std::make_shared(), + nullptr); + TopicSession = Runtime.Register(NewTopicSession( topicPath, Config, @@ 
-49,7 +65,8 @@ class TFixture : public NUnitTest::TBaseFixture { 0, Driver, CredentialsProviderFactory, - MakeIntrusive() + MakeIntrusive(), + CreatePqNativeGateway(pqServices) ).release()); Runtime.EnableScheduleForActor(TopicSession); @@ -97,22 +114,24 @@ class TFixture : public NUnitTest::TBaseFixture { } void ExpectMessageBatch(NActors::TActorId readActorId, const std::vector& expected) { + Runtime.Send(new IEventHandle(TopicSession, readActorId, new TEvRowDispatcher::TEvGetNextBatch())); + auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); UNIT_ASSERT(eventHolder.Get() != nullptr); - UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); - UNIT_ASSERT(expected.size() == eventHolder->Get()->Record.MessagesSize()); + UNIT_ASSERT_VALUES_EQUAL(eventHolder->Get()->ReadActorId, readActorId); + UNIT_ASSERT_VALUES_EQUAL(expected.size(), eventHolder->Get()->Record.MessagesSize()); for (size_t i = 0; i < expected.size(); ++i) { NFq::NRowDispatcherProto::TEvMessage message = eventHolder->Get()->Record.GetMessages(i); std::cerr << "message.GetJson() " << message.GetJson() << std::endl; - UNIT_ASSERT(expected[i] == message.GetJson()); + UNIT_ASSERT_VALUES_EQUAL(expected[i], message.GetJson()); } } void ExpectSessionError(NActors::TActorId readActorId, TString message) { auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); UNIT_ASSERT(eventHolder.Get() != nullptr); - UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); - UNIT_ASSERT(TString(eventHolder->Get()->Record.GetMessage()).Contains(message)); + UNIT_ASSERT_VALUES_EQUAL(eventHolder->Get()->ReadActorId, readActorId); + UNIT_ASSERT_STRING_CONTAINS(TString(eventHolder->Get()->Record.GetMessage()), message); } void ExpectNewDataArrived(TSet readActorIds) { @@ -129,7 +148,7 @@ class TFixture : public NUnitTest::TBaseFixture { Runtime.Send(new IEventHandle(TopicSession, readActorId, new 
TEvRowDispatcher::TEvGetNextBatch())); auto eventHolder = Runtime.GrabEdgeEvent(RowDispatcherActorId, TDuration::Seconds(GrabTimeoutSec)); UNIT_ASSERT(eventHolder.Get() != nullptr); - UNIT_ASSERT(eventHolder->Get()->ReadActorId == readActorId); + UNIT_ASSERT_VALUES_EQUAL(eventHolder->Get()->ReadActorId, readActorId); return eventHolder->Get()->Record.MessagesSize(); } @@ -162,8 +181,6 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { const std::vector data = { Json1 }; PQWrite(data, topicName); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, { Json1 }); ExpectMessageBatch(ReadActorId2, { Json1 }); @@ -204,8 +221,6 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { const std::vector data = { Json1 }; PQWrite(data, topicName); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, { Json1 }); ExpectMessageBatch(ReadActorId2, { Json1 }); @@ -223,7 +238,6 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { const std::vector data = { Json1 }; PQWrite(data, topicName); ExpectNewDataArrived({ReadActorId1}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, data); StartSession(ReadActorId2, source); @@ -232,9 +246,7 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { PQWrite(data2, topicName); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, data2); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new 
TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId2, data2); StopSession(ReadActorId1, source); @@ -253,21 +265,16 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { StartSession(ReadActorId2, source, 2); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); std::vector expected1 = { Json2, Json3}; ExpectMessageBatch(ReadActorId1, expected1); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); std::vector expected2 = { Json3 }; ExpectMessageBatch(ReadActorId2, expected2); const std::vector data2 = { Json4 }; PQWrite(data2, topicName); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, data2); - - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId2, data2); StopSession(ReadActorId1, source); @@ -284,7 +291,7 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { const std::vector data = { "not json", "noch einmal / nicht json" }; PQWrite(data, topicName); - ExpectSessionError(ReadActorId1, "DB::ParsingException: Cannot parse input: expected '{' before: 'not json': (at row 1)"); + ExpectSessionError(ReadActorId1, "INCORRECT_TYPE: The JSON element does not have the requested type."); StopSession(ReadActorId1, source); } @@ -298,7 +305,6 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { const std::vector data = { Json1, Json2 }; // offset 0, 1 PQWrite(data, topicName); ExpectNewDataArrived({ReadActorId1}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, data); // Restart topic session. 
@@ -308,10 +314,7 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { PQWrite({ Json3 }, topicName); ExpectNewDataArrived({ReadActorId1}); - Runtime.Send(new IEventHandle(TopicSession, ReadActorId1, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId1, { Json3 }); - - Runtime.Send(new IEventHandle(TopicSession, ReadActorId2, new TEvRowDispatcher::TEvGetNextBatch())); ExpectMessageBatch(ReadActorId2, { Json2, Json3 }); StopSession(ReadActorId1, source); @@ -372,6 +375,53 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { StopSession(ReadActorId1, source); StopSession(ReadActorId2, source); } + + Y_UNIT_TEST_F(TwoSessionsWithDifferentSchemes, TFixture) { + const TString topicName = "dif_schemes"; + PQCreateStream(topicName); + Init(topicName); + auto source1 = BuildSource(topicName); + auto source2 = BuildSource(topicName); + source2.AddColumns("field1"); + source2.AddColumnTypes("String"); + + StartSession(ReadActorId1, source1); + StartSession(ReadActorId2, source2); + + TString json1 = "{\"dt\":101,\"value\":\"value1\", \"field1\":\"field1\"}"; + TString json2 = "{\"dt\":102,\"value\":\"value2\", \"field1\":\"field2\"}"; + + Sleep(TDuration::Seconds(3)); + PQWrite({ json1, json2 }, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2}); + ExpectMessageBatch(ReadActorId1, { "{\"dt\":101,\"value\":\"value1\"}", "{\"dt\":102,\"value\":\"value2\"}" }); + ExpectMessageBatch(ReadActorId2, { "{\"dt\":101,\"field1\":\"field1\",\"value\":\"value1\"}", "{\"dt\":102,\"field1\":\"field2\",\"value\":\"value2\"}" }); + + auto source3 = BuildSource(topicName); + source3.AddColumns("field2"); + source3.AddColumnTypes("String"); + auto readActorId3 = Runtime.AllocateEdgeActor(); + StartSession(readActorId3, source3); + + TString json3 = "{\"dt\":103,\"value\":\"value3\", \"field1\":\"value1_field1\", \"field2\":\"value1_field2\"}"; + PQWrite({ json3 }, topicName); + ExpectNewDataArrived({ReadActorId1, ReadActorId2, readActorId3}); + ExpectMessageBatch(ReadActorId1, { 
"{\"dt\":103,\"value\":\"value3\"}" }); + ExpectMessageBatch(ReadActorId2, { "{\"dt\":103,\"field1\":\"value1_field1\",\"value\":\"value3\"}" }); + ExpectMessageBatch(readActorId3, { "{\"dt\":103,\"field2\":\"value1_field2\",\"value\":\"value3\"}" }); + + StopSession(ReadActorId1, source3); + StopSession(readActorId3, source3); + + TString json4 = "{\"dt\":104,\"value\":\"value4\", \"field1\":\"value2_field1\", \"field2\":\"value2_field2\"}"; + TString json5 = "{\"dt\":105,\"value\":\"value5\", \"field1\":\"value2_field1\", \"field2\":\"value2_field2\"}"; + PQWrite({ json4, json5 }, topicName); + ExpectNewDataArrived({ReadActorId2}); + ExpectMessageBatch(ReadActorId2, { "{\"dt\":104,\"field1\":\"value2_field1\",\"value\":\"value4\"}", "{\"dt\":105,\"field1\":\"value2_field1\",\"value\":\"value5\"}" }); + + StopSession(ReadActorId1, source1); + StopSession(ReadActorId2, source2); + } } } diff --git a/ydb/core/fq/libs/row_dispatcher/ut/ya.make b/ydb/core/fq/libs/row_dispatcher/ut/ya.make index 25242d092f28..bb66ec57798f 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/ya.make +++ b/ydb/core/fq/libs/row_dispatcher/ut/ya.make @@ -20,7 +20,6 @@ PEERDIR( ydb/library/yql/udfs/common/yson2 ydb/tests/fq/pq_async_io ydb/library/yql/sql/pg_dummy - ydb/library/yql/udfs/common/clickhouse/client ) SIZE(MEDIUM) diff --git a/ydb/core/fq/libs/row_dispatcher/ya.make b/ydb/core/fq/libs/row_dispatcher/ya.make index f1f036d20dc0..44be15461ee6 100644 --- a/ydb/core/fq/libs/row_dispatcher/ya.make +++ b/ydb/core/fq/libs/row_dispatcher/ya.make @@ -13,6 +13,7 @@ SRCS( PEERDIR( contrib/libs/fmt + contrib/libs/simdjson ydb/core/fq/libs/actors/logging ydb/core/fq/libs/config/protos ydb/core/fq/libs/control_plane_storage @@ -34,6 +35,8 @@ YQL_LAST_ABI_VERSION() END() -RECURSE_FOR_TESTS( - ut -) +IF(NOT EXPORT_CMAKE) + RECURSE_FOR_TESTS( + ut + ) +ENDIF() diff --git a/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.cpp 
b/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.cpp index 8b87f8aebcd0..feaaaaac8d52 100644 --- a/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.cpp +++ b/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.cpp @@ -31,7 +31,8 @@ class TLocalServiceHolder { NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, int threads, IMetricsRegistryPtr metricsRegistry, const std::function& metricsPusherFactory, - bool withSpilling) + bool withSpilling, + TVector>&& additionalLocalServices) : MetricsRegistry(metricsRegistry ? metricsRegistry : CreateMetricsRegistry(GetSensorsGroupFor(NSensorComponent::kDq)) @@ -90,6 +91,9 @@ class TLocalServiceHolder { NDq::MakeDqLocalFileSpillingServiceID(nodeId), TActorSetupCmd(spillingActor, TMailboxType::Simple, 0)); } + for (auto& [actorId, setupCmd] : additionalLocalServices) { + ServiceNode->AddLocalService(actorId, std::move(setupCmd)); + } auto statsCollector = CreateStatsCollector(1, *ServiceNode->GetSetup(), MetricsRegistry->GetSensors()); @@ -249,7 +253,8 @@ THolder CreateLocalServiceHolder(const NKikimr::NMiniKQL::I NBus::TBindResult interconnectPort, NBus::TBindResult grpcPort, NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, int threads, IMetricsRegistryPtr metricsRegistry, - const std::function& metricsPusherFactory, bool withSpilling) + const std::function& metricsPusherFactory, bool withSpilling, + TVector>&& additionalLocalServices) { return MakeHolder(functionRegistry, compFactory, @@ -261,7 +266,8 @@ THolder CreateLocalServiceHolder(const NKikimr::NMiniKQL::I threads, metricsRegistry, metricsPusherFactory, - withSpilling); + withSpilling, + std::move(additionalLocalServices)); } TIntrusivePtr CreateLocalDqGateway(const NKikimr::NMiniKQL::IFunctionRegistry* functionRegistry, @@ -269,7 +275,8 @@ TIntrusivePtr CreateLocalDqGateway(const NKikimr::NMiniKQL::IFunctio TTaskTransformFactory taskTransformFactory, const TDqTaskPreprocessorFactoryCollection& dqTaskPreprocessorFactories, bool 
withSpilling, NDq::IDqAsyncIoFactory::TPtr asyncIoFactory, int threads, IMetricsRegistryPtr metricsRegistry, - const std::function& metricsPusherFactory) + const std::function& metricsPusherFactory, + TVector>&& additionalLocalServices) { int startPort = 31337; TRangeWalker portWalker(startPort, startPort+100); @@ -288,7 +295,8 @@ TIntrusivePtr CreateLocalDqGateway(const NKikimr::NMiniKQL::IFunctio threads, metricsRegistry, metricsPusherFactory, - withSpilling), + withSpilling, + std::move(additionalLocalServices)), CreateDqGateway("[::1]", grpcPort.Addr.GetPort())); } diff --git a/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.h b/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.h index 91329aab8dc6..77aafdbd61e7 100644 --- a/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.h +++ b/ydb/library/yql/providers/dq/local_gateway/yql_dq_gateway_local.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -17,6 +18,7 @@ TIntrusivePtr CreateLocalDqGateway(const NKikimr::NMiniKQL::IFunctio bool withSpilling, NDq::IDqAsyncIoFactory::TPtr = nullptr, int threads = 16, IMetricsRegistryPtr metricsRegistry = {}, - const std::function& metricsPusherFactory = {}); + const std::function& metricsPusherFactory = {}, + TVector>&& additionalLocalServices = {}); } // namespace NYql diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp index 74fdb0949230..91ec056ff986 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -94,16 +94,19 @@ struct TEvPrivate { enum EEv : ui32 { EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), EvPrintState = EvBegin + 20, + EvProcessState = EvBegin + 21, EvEnd }; static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), 
"expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); struct TEvPrintState : public NActors::TEventLocal {}; + struct TEvProcessState : public NActors::TEventLocal {}; }; -ui64 PrintStatePeriodSec = 60; - class TDqPqRdReadActor : public NActors::TActor, public NYql::NDq::NInternal::TDqPqReadActorBase { -public: + + const ui64 PrintStatePeriodSec = 60; + const ui64 ProcessStatePeriodSec = 2; + using TDebugOffsets = TMaybe>; struct TReadyBatch { @@ -136,6 +139,7 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: ui64 CoordinatorRequestCookie = 0; TRowDispatcherReadActorMetrics Metrics; bool SchedulePrintStatePeriod = false; + bool ProcessStateScheduled = false; struct SessionInfo { enum class ESessionStatus { @@ -146,8 +150,10 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: const TTxId& txId, const NActors::TActorId selfId, TActorId rowDispatcherActorId, + ui64 partitionId, ui64 eventQueueId) - : RowDispatcherActorId(rowDispatcherActorId) { + : RowDispatcherActorId(rowDispatcherActorId) + , PartitionId(partitionId) { EventsQueue.Init(txId, selfId, selfId, eventQueueId, /* KeepAlive */ true); EventsQueue.OnNewRecipientId(rowDispatcherActorId); } @@ -156,11 +162,15 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: ui64 NextOffset = 0; bool IsWaitingRowDispatcherResponse = false; NYql::NDq::TRetryEventsQueue EventsQueue; - bool NewDataArrived = false; + bool HasPendingData = false; TActorId RowDispatcherActorId; + ui64 PartitionId; }; TMap Sessions; + const THolderFactory& HolderFactory; + const i64 MaxBufferSize; + i64 ReadyBufferSizeBytes = 0; public: TDqPqRdReadActor( @@ -174,7 +184,8 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: const NActors::TActorId& computeActorId, const NActors::TActorId& localRowDispatcherActorId, const TString& token, - const ::NMonitoring::TDynamicCounterPtr& counters); + const ::NMonitoring::TDynamicCounterPtr& counters, + i64 bufferSize); void 
Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev); void Handle(NFq::TEvRowDispatcher::TEvCoordinatorResult::TPtr& ev); @@ -193,6 +204,7 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: void Handle(NActors::TEvents::TEvPong::TPtr& ev); void Handle(const NActors::TEvents::TEvPing::TPtr&); void Handle(TEvPrivate::TEvPrintState::TPtr&); + void Handle(TEvPrivate::TEvProcessState::TPtr&); STRICT_STFUNC(StateFunc, { hFunc(NFq::TEvRowDispatcher::TEvCoordinatorChanged, Handle); @@ -212,6 +224,7 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle); hFunc(NActors::TEvents::TEvPing, Handle); hFunc(TEvPrivate::TEvPrintState, Handle); + hFunc(TEvPrivate::TEvProcessState, Handle); }) static constexpr char ActorName[] = "DQ_PQ_READ_ACTOR"; @@ -224,8 +237,9 @@ class TDqPqRdReadActor : public NActors::TActor, public NYql:: void ProcessState(); void Stop(const TString& message); void StopSessions(); - void ReInit(); + void ReInit(const TString& reason); void PrintInternalState(); + void TrySendGetNextBatch(SessionInfo& sessionInfo); }; TDqPqRdReadActor::TDqPqRdReadActor( @@ -233,18 +247,21 @@ TDqPqRdReadActor::TDqPqRdReadActor( TCollectStatsLevel statsLevel, const TTxId& txId, ui64 taskId, - const THolderFactory& /*holderFactory*/, + const THolderFactory& holderFactory, NPq::NProto::TDqPqTopicSource&& sourceParams, NPq::NProto::TDqReadTaskParams&& readParams, const NActors::TActorId& computeActorId, const NActors::TActorId& localRowDispatcherActorId, const TString& token, - const ::NMonitoring::TDynamicCounterPtr& counters) + const ::NMonitoring::TDynamicCounterPtr& counters, + i64 bufferSize) : TActor(&TDqPqRdReadActor::StateFunc) , TDqPqReadActorBase(inputIndex, taskId, this->SelfId(), txId, std::move(sourceParams), std::move(readParams), computeActorId) , Token(token) , LocalRowDispatcherActorId(localRowDispatcherActorId) , Metrics(txId, taskId, counters) + , 
HolderFactory(holderFactory) + , MaxBufferSize(bufferSize) { MetadataFields.reserve(SourceParams.MetadataFieldsSize()); TPqMetaExtractor fieldsExtractor; @@ -253,7 +270,7 @@ TDqPqRdReadActor::TDqPqRdReadActor( } IngressStats.Level = statsLevel; - SRC_LOG_D("Start read actor, local row dispatcher " << LocalRowDispatcherActorId.ToString()); + SRC_LOG_D("Start read actor, local row dispatcher " << LocalRowDispatcherActorId.ToString() << ", metadatafields: " << JoinSeq(',', SourceParams.GetMetadataFields())); } void TDqPqRdReadActor::ProcessState() { @@ -262,6 +279,10 @@ void TDqPqRdReadActor::ProcessState() { if (!ReadyBuffer.empty()) { return; } + if (!ProcessStateScheduled) { + ProcessStateScheduled = true; + Schedule(TDuration::Seconds(ProcessStatePeriodSec), new TEvPrivate::TEvProcessState()); + } if (!CoordinatorActorId) { SRC_LOG_D("Send TEvCoordinatorChangesSubscribe to local row dispatcher, self id " << SelfId()); Send(LocalRowDispatcherActorId, new NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscribe()); @@ -363,7 +384,6 @@ i64 TDqPqRdReadActor::GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& b buffer.clear(); do { auto& readyBatch = ReadyBuffer.front(); - SRC_LOG_T("Return " << readyBatch.Data.size() << " items"); for (const auto& message : readyBatch.Data) { auto [item, size] = CreateItem(message); @@ -371,15 +391,21 @@ i64 TDqPqRdReadActor::GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& b } usedSpace += readyBatch.UsedSpace; freeSpace -= readyBatch.UsedSpace; - SRC_LOG_T("usedSpace " << usedSpace); - SRC_LOG_T("freeSpace " << freeSpace); - TPartitionKey partitionKey{TString{}, readyBatch.PartitionId}; PartitionToOffset[partitionKey] = readyBatch.NextOffset; SRC_LOG_T("NextOffset " << readyBatch.NextOffset); ReadyBuffer.pop(); } while (freeSpace > 0 && !ReadyBuffer.empty()); + ReadyBufferSizeBytes -= usedSpace; + SRC_LOG_T("Return " << buffer.RowCount() << " rows, buffer size " << ReadyBufferSizeBytes << ", free space " << freeSpace << 
", result size " << usedSpace); + + if (!ReadyBuffer.empty()) { + Send(ComputeActorId, new TEvNewAsyncInputDataArrived(InputIndex)); + } + for (auto& [partitionId, sessionInfo] : Sessions) { + TrySendGetNextBatch(sessionInfo); + } ProcessState(); return usedSpace; } @@ -401,7 +427,12 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvStartSessionAck::TPtr& e ui64 partitionId = ev->Get()->Record.GetConsumer().GetPartitionId(); auto sessionIt = Sessions.find(partitionId); - YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Ignore TEvStartSessionAck from " << ev->Sender << ", seqNo " << meta.GetSeqNo() + << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); + YQL_ENSURE(State != EState::STARTED); + return; + } auto& sessionInfo = sessionIt->second; if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); @@ -415,7 +446,12 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev) ui64 partitionId = ev->Get()->Record.GetPartitionId(); auto sessionIt = Sessions.find(partitionId); - YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Ignore TEvSessionError from " << ev->Sender << ", seqNo " << meta.GetSeqNo() + << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); + YQL_ENSURE(State != EState::STARTED); + return; + } auto& sessionInfo = sessionIt->second; if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { @@ -431,7 +467,12 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev) { ui64 partitionId = ev->Get()->Record.GetPartitionId(); auto sessionIt = Sessions.find(partitionId); - YQL_ENSURE(sessionIt != Sessions.end(), "Unknown partition id"); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Ignore TEvStatus from " << ev->Sender << ", seqNo " << meta.GetSeqNo() 
+ << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); + YQL_ENSURE(State != EState::STARTED); + return; + } auto& sessionInfo = sessionIt->second; if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { @@ -452,7 +493,9 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev ui64 partitionId = ev->Get()->Record.GetPartitionId(); auto sessionIt = Sessions.find(partitionId); if (sessionIt == Sessions.end()) { - Stop("Internal error: unknown partition id " + ToString(partitionId)); + SRC_LOG_W("Ignore TEvNewDataArrived from " << ev->Sender << ", seqNo " << meta.GetSeqNo() + << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); + YQL_ENSURE(State != EState::STARTED); return; } @@ -461,11 +504,8 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); return; } - sessionInfo.NewDataArrived = true; - Metrics.InFlyGetNextBatch->Inc(); - auto event = std::make_unique(); - event->Record.SetPartitionId(partitionId); - sessionInfo.EventsQueue.Send(event.release()); + sessionInfo.HasPendingData = true; + TrySendGetNextBatch(sessionInfo); } void TDqPqRdReadActor::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr& ev) { @@ -512,20 +552,18 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr } CoordinatorActorId = ev->Get()->CoordinatorActorId; - SRC_LOG_I("Coordinator is changed, reinit all sessions"); - ReInit(); + ReInit("Coordinator is changed"); ProcessState(); } -void TDqPqRdReadActor::ReInit() { - SRC_LOG_I("ReInit state"); +void TDqPqRdReadActor::ReInit(const TString& reason) { + SRC_LOG_I("ReInit state, reason " << reason); StopSessions(); Sessions.clear(); State = EState::INIT; if (!ReadyBuffer.empty()) { Send(ComputeActorId, new TEvNewAsyncInputDataArrived(InputIndex)); } - ProcessState(); } void TDqPqRdReadActor::Stop(const TString& 
message) { @@ -551,7 +589,7 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvCoordinatorResult::TPtr& Sessions.emplace( std::piecewise_construct, std::forward_as_tuple(partitionId), - std::forward_as_tuple(TxId, SelfId(), rowDispatcherActorId, partitionId)); + std::forward_as_tuple(TxId, SelfId(), rowDispatcherActorId, partitionId, partitionId)); } } } @@ -582,8 +620,7 @@ void TDqPqRdReadActor::Handle(NActors::TEvents::TEvUndelivered::TPtr& ev) { } if (CoordinatorActorId && *CoordinatorActorId == ev->Sender) { - SRC_LOG_D("TEvUndelivered to coordinator, reinit"); - ReInit(); + ReInit("TEvUndelivered to coordinator"); } } @@ -591,15 +628,15 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) const NYql::NDqProto::TMessageTransportMeta& meta = ev->Get()->Record.GetTransportMeta(); SRC_LOG_T("TEvMessageBatch from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo()); ui64 partitionId = ev->Get()->Record.GetPartitionId(); - YQL_ENSURE(Sessions.count(partitionId), "Unknown partition id"); - auto it = Sessions.find(partitionId); - if (it == Sessions.end()) { - Stop("Wrong session data"); - return; + auto sessionIt = Sessions.find(partitionId); + if (sessionIt == Sessions.end()) { + SRC_LOG_W("Ignore TEvMessageBatch from " << ev->Sender << ", seqNo " << meta.GetSeqNo() + << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); + YQL_ENSURE(State != EState::STARTED); } - Metrics.InFlyGetNextBatch->Dec(); - auto& sessionInfo = it->second; + Metrics.InFlyGetNextBatch->Set(0); + auto& sessionInfo = sessionIt->second; if (!sessionInfo.EventsQueue.OnEventReceived(ev)) { SRC_LOG_W("Wrong seq num ignore message, seqNo " << meta.GetSeqNo()); return; @@ -611,11 +648,12 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) for (const auto& message : ev->Get()->Record.GetMessages()) { SRC_LOG_T("Json: " << message.GetJson()); 
activeBatch.Data.emplace_back(message.GetJson()); - activeBatch.UsedSpace += message.GetJson().size(); sessionInfo.NextOffset = message.GetOffset() + 1; bytes += message.GetJson().size(); SRC_LOG_T("TEvMessageBatch NextOffset " << sessionInfo.NextOffset); } + activeBatch.UsedSpace = bytes; + ReadyBufferSizeBytes += bytes; IngressStats.Bytes += bytes; IngressStats.Chunks++; activeBatch.NextOffset = ev->Get()->Record.GetNextMessageOffset(); @@ -625,14 +663,26 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) std::pair TDqPqRdReadActor::CreateItem(const TString& data) { i64 usedSpace = 0; NUdf::TUnboxedValuePod item; - item = NKikimr::NMiniKQL::MakeString(NUdf::TStringRef(data.Data(), data.Size())); - usedSpace += data.Size(); + if (MetadataFields.empty()) { + item = NKikimr::NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + usedSpace += data.size(); + return std::make_pair(item, usedSpace); + } + + NUdf::TUnboxedValue* itemPtr; + item = HolderFactory.CreateDirectArrayHolder(MetadataFields.size() + 1, itemPtr); + *(itemPtr++) = NKikimr::NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + usedSpace += data.size(); + + for ([[maybe_unused]] const auto& [name, extractor] : MetadataFields) { + auto ub = NYql::NUdf::TUnboxedValuePod(0); // TODO: use real values + *(itemPtr++) = std::move(ub); + } return std::make_pair(item, usedSpace); } void TDqPqRdReadActor::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr& ev) { - SRC_LOG_D("Session closed, event queue id " << ev->Get()->EventQueueId); - ReInit(); + ReInit(TStringBuilder() << "Session closed, event queue id " << ev->Get()->EventQueueId); } void TDqPqRdReadActor::Handle(NActors::TEvents::TEvPong::TPtr& ev) { @@ -654,6 +704,25 @@ void TDqPqRdReadActor::PrintInternalState() { SRC_LOG_D(str.Str()); } +void TDqPqRdReadActor::Handle(TEvPrivate::TEvProcessState::TPtr&) { + Schedule(TDuration::Seconds(ProcessStatePeriodSec), new 
TEvPrivate::TEvProcessState()); + ProcessState(); +} + +void TDqPqRdReadActor::TrySendGetNextBatch(SessionInfo& sessionInfo) { + if (!sessionInfo.HasPendingData) { + return; + } + if (ReadyBufferSizeBytes > MaxBufferSize) { + return; + } + Metrics.InFlyGetNextBatch->Inc(); + auto event = std::make_unique(); + sessionInfo.HasPendingData = false; + event->Record.SetPartitionId(sessionInfo.PartitionId); + sessionInfo.EventsQueue.Send(event.release()); +} + std::pair CreateDqPqRdReadActor( NPq::NProto::TDqPqTopicSource&& settings, ui64 inputIndex, @@ -666,7 +735,7 @@ std::pair CreateDqPqRdReadActor( const NActors::TActorId& localRowDispatcherActorId, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const ::NMonitoring::TDynamicCounterPtr& counters, - i64 /*bufferSize*/) // TODO + i64 bufferSize) { auto taskParamsIt = taskParams.find("pq"); YQL_ENSURE(taskParamsIt != taskParams.end(), "Failed to get pq task params"); @@ -688,7 +757,8 @@ std::pair CreateDqPqRdReadActor( computeActorId, localRowDispatcherActorId, token, - counters + counters, + bufferSize ); return {actor, actor}; diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h index d1131fd7a76e..f362fa10bd8b 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.h @@ -20,7 +20,7 @@ namespace NYql::NDq { class TDqAsyncIoFactory; -const i64 PQRdReadDefaultFreeSpace = 16_MB; +const i64 PQRdReadDefaultFreeSpace = 256_MB; std::pair CreateDqPqRdReadActor( NPq::NProto::TDqPqTopicSource&& settings, diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp index b57f80c8478c..9d9d5a03da5a 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.cpp @@ -128,7 +128,8 @@ class TDqPqReadActor : public 
NActors::TActor, public NYql::NDq: std::shared_ptr credentialsProviderFactory, const NActors::TActorId& computeActorId, const ::NMonitoring::TDynamicCounterPtr& counters, - i64 bufferSize) + i64 bufferSize, + const IPqGateway::TPtr& pqGateway) : TActor(&TDqPqReadActor::StateFunc) , TDqPqReadActorBase(inputIndex, taskId, this->SelfId(), txId, std::move(sourceParams), std::move(readParams), computeActorId) , Metrics(txId, taskId, counters) @@ -136,6 +137,7 @@ class TDqPqReadActor : public NActors::TActor, public NYql::NDq: , HolderFactory(holderFactory) , Driver(std::move(driver)) , CredentialsProviderFactory(std::move(credentialsProviderFactory)) + , PqGateway(pqGateway) { MetadataFields.reserve(SourceParams.MetadataFieldsSize()); TPqMetaExtractor fieldsExtractor; @@ -185,9 +187,9 @@ class TDqPqReadActor : public NActors::TActor, public NYql::NDq: } } - NYdb::NTopic::TTopicClient& GetTopicClient() { + ITopicClient& GetTopicClient() { if (!TopicClient) { - TopicClient = std::make_unique(Driver, GetTopicClientSettings()); + TopicClient = PqGateway->GetTopicClient(Driver, GetTopicClientSettings()); } return *TopicClient; } @@ -229,7 +231,7 @@ class TDqPqReadActor : public NActors::TActor, public NYql::NDq: ReadSession->Close(TDuration::Zero()); ReadSession.reset(); } - TopicClient.reset(); + TopicClient.Reset(); TActor::PassAway(); } @@ -568,7 +570,7 @@ class TDqPqReadActor : public NActors::TActor, public NYql::NDq: const THolderFactory& HolderFactory; NYdb::TDriver Driver; std::shared_ptr CredentialsProviderFactory; - std::unique_ptr TopicClient; + ITopicClient::TPtr TopicClient; std::shared_ptr ReadSession; NThreading::TFuture EventFuture; std::queue> DeferredCommits; @@ -578,6 +580,7 @@ class TDqPqReadActor : public NActors::TActor, public NYql::NDq: std::queue ReadyBuffer; TMaybe> WatermarkTracker; TMaybe NextIdlenesCheckAt; + IPqGateway::TPtr PqGateway; }; std::pair CreateDqPqReadActor( @@ -593,6 +596,7 @@ std::pair CreateDqPqReadActor( const NActors::TActorId& 
computeActorId, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const ::NMonitoring::TDynamicCounterPtr& counters, + IPqGateway::TPtr pqGateway, i64 bufferSize ) { @@ -618,15 +622,16 @@ std::pair CreateDqPqReadActor( CreateCredentialsProviderFactoryForStructuredToken(credentialsFactory, token, addBearerToToken), computeActorId, counters, - bufferSize + bufferSize, + pqGateway ); return {actor, actor}; } -void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const ::NMonitoring::TDynamicCounterPtr& counters) { +void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const IPqGateway::TPtr& pqGateway, const ::NMonitoring::TDynamicCounterPtr& counters) { factory.RegisterSource("PqSource", - [driver = std::move(driver), credentialsFactory = std::move(credentialsFactory), counters]( + [driver = std::move(driver), credentialsFactory = std::move(credentialsFactory), counters, pqGateway]( NPq::NProto::TDqPqTopicSource&& settings, IDqAsyncIoFactory::TSourceArguments&& args) { @@ -646,6 +651,7 @@ void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driv args.ComputeActorId, args.HolderFactory, counters, + pqGateway, PQReadDefaultFreeSpace); } diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h index 161e9e5eba57..201ce476dda0 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_read_actor.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -35,9 +36,10 @@ std::pair CreateDqPqReadActor( const NActors::TActorId& computeActorId, const NKikimr::NMiniKQL::THolderFactory& holderFactory, const ::NMonitoring::TDynamicCounterPtr& counters, + IPqGateway::TPtr pqGateway, i64 bufferSize = PQReadDefaultFreeSpace 
); -void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const ::NMonitoring::TDynamicCounterPtr& counters = MakeIntrusive<::NMonitoring::TDynamicCounters>()); +void RegisterDqPqReadActorFactory(TDqAsyncIoFactory& factory, NYdb::TDriver driver, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, const IPqGateway::TPtr& pqGateway, const ::NMonitoring::TDynamicCounterPtr& counters = MakeIntrusive<::NMonitoring::TDynamicCounters>()); } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json index 8a8f172d307f..0b178695aaeb 100644 --- a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json +++ b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json @@ -71,8 +71,7 @@ {"Index": 1, "Name": "Columns", "Type": "TExprBase"}, {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList"}, {"Index": 3, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 4, "Name": "FilterPredicate", "Type": "TCoLambda"}, - {"Index": 5, "Name": "ColumnTypes", "Type": "TExprBase"} + {"Index": 4, "Name": "FilterPredicate", "Type": "TCoLambda"} ] }, { diff --git a/ydb/library/yql/providers/pq/gateway/dummy/ya.make b/ydb/library/yql/providers/pq/gateway/dummy/ya.make index fa37f1376e2d..227f6a0672f5 100644 --- a/ydb/library/yql/providers/pq/gateway/dummy/ya.make +++ b/ydb/library/yql/providers/pq/gateway/dummy/ya.make @@ -2,6 +2,7 @@ LIBRARY() SRCS( yql_pq_dummy_gateway.cpp + yql_pq_file_topic_client.cpp ) PEERDIR( diff --git a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.cpp b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.cpp index ea1eb61432ba..f1575567e468 100644 --- a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.cpp +++ b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.cpp @@ -1,4 
+1,5 @@ #include "yql_pq_dummy_gateway.h" +#include "yql_pq_file_topic_client.h" #include #include @@ -59,6 +60,14 @@ TDummyPqGateway& TDummyPqGateway::AddDummyTopic(const TDummyTopic& topic) { } } +IPqGateway::TPtr CreatePqFileGateway() { + return MakeIntrusive(); +} + +ITopicClient::TPtr TDummyPqGateway::GetTopicClient(const NYdb::TDriver&, const NYdb::NTopic::TTopicClientSettings&) { + return MakeIntrusive(Topics); +} + void TDummyPqGateway::UpdateClusterConfigs( const TString& clusterName, const TString& endpoint, diff --git a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.h b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.h index fe838c5b1ff4..84a394531ee0 100644 --- a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.h +++ b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_dummy_gateway.h @@ -9,9 +9,10 @@ namespace NYql { struct TDummyTopic { - TDummyTopic(const TString& cluster, const TString& path) + TDummyTopic(const TString& cluster, const TString& path, const TMaybe& filePath = {}) : Cluster(cluster) , Path(path) + , FilePath(filePath) { } @@ -22,6 +23,7 @@ struct TDummyTopic { TString Cluster; TString Path; + TMaybe FilePath; size_t PartitionsCount = 1; }; @@ -29,8 +31,8 @@ struct TDummyTopic { class TDummyPqGateway : public IPqGateway { public: TDummyPqGateway& AddDummyTopic(const TDummyTopic& topic); + ~TDummyPqGateway() {} -public: NThreading::TFuture OpenSession(const TString& sessionId, const TString& username) override; NThreading::TFuture CloseSession(const TString& sessionId) override; @@ -54,11 +56,17 @@ class TDummyPqGateway : public IPqGateway { const TString& endpoint, const TString& database, bool secure) override; + + ITopicClient::TPtr GetTopicClient(const NYdb::TDriver& driver, const NYdb::NTopic::TTopicClientSettings& settings) override; + using TClusterNPath = std::pair; private: mutable TMutex Mutex; - THashMap, TDummyTopic> Topics; + THashMap Topics; + THashSet OpenedSessions; }; 
+IPqGateway::TPtr CreatePqFileGateway(); + } // namespace NYql diff --git a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.cpp b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.cpp new file mode 100644 index 000000000000..55f284b8f865 --- /dev/null +++ b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.cpp @@ -0,0 +1,314 @@ +#include "yql_pq_file_topic_client.h" +#include "util/stream/file.h" + +#include + +#include +#include + +#include + +namespace NYql { + +class TBlockingEQueue { +public: + TBlockingEQueue(size_t maxSize):MaxSize_(maxSize) { + } + void Push(NYdb::NTopic::TReadSessionEvent::TEvent&& e, size_t size) { + with_lock(Mutex_) { + CanPush_.WaitI(Mutex_, [this] () {return Stopped_ || Size_ < MaxSize_;}); + Events_.emplace_back(std::move(e), size ); + Size_ += size; + } + CanPop_.BroadCast(); + } + + void BlockUntilEvent() { + with_lock(Mutex_) { + CanPop_.WaitI(Mutex_, [this] () {return Stopped_ || !Events_.empty();}); + } + } + + TMaybe Pop(bool block) { + with_lock(Mutex_) { + if (block) { + CanPop_.WaitI(Mutex_, [this] () {return CanPopPredicate();}); + } else { + if (!CanPopPredicate()) { + return {}; + } + } + auto [front, size] = std::move(Events_.front()); + Events_.pop_front(); + Size_ -= size; + if (Size_ < MaxSize_) { + CanPush_.BroadCast(); + } + return front; + } + } + + void Stop() { + with_lock(Mutex_) { + Stopped_ = true; + CanPop_.BroadCast(); + CanPush_.BroadCast(); + } + } + + bool IsStopped() { + with_lock(Mutex_) { + return Stopped_; + } + } + +private: + bool CanPopPredicate() { + return !Events_.empty() && !Stopped_; + } + + size_t MaxSize_; + size_t Size_ = 0; + TDeque> Events_; + bool Stopped_ = false; + TMutex Mutex_; + TCondVar CanPop_; + TCondVar CanPush_; +}; + +class TFileTopicReadSession : public NYdb::NTopic::IReadSession { + +constexpr static auto FILE_POLL_PERIOD = TDuration::MilliSeconds(5); + +public: + TFileTopicReadSession(TFile file, 
NYdb::NTopic::TPartitionSession::TPtr session, const TString& producerId = ""): + File_(std::move(file)), Session_(std::move(session)), ProducerId_(producerId), + FilePoller_([this] () { + PollFileForChanges(); + }), Counters_() + { + Pool_.Start(1); + } + + NThreading::TFuture WaitEvent() override { + return NThreading::Async([this] () { + EventsQ_.BlockUntilEvent(); + return NThreading::MakeFuture(); + }, Pool_); + } + + TVector GetEvents(bool block, TMaybe maxEventsCount, size_t maxByteSize) override { + // TODO + Y_UNUSED(maxByteSize); + + TVector res; + for (auto event = EventsQ_.Pop(block); !event.Empty() && res.size() <= maxEventsCount.GetOrElse(std::numeric_limits::max()); event = EventsQ_.Pop(/*block=*/ false)) { + res.push_back(*event); + } + return res; + } + + TVector GetEvents(const NYdb::NTopic::TReadSessionGetEventSettings& settings) override { + return GetEvents(settings.Block_, settings.MaxEventsCount_, settings.MaxByteSize_); + } + + TMaybe GetEvent(bool block, size_t maxByteSize) override { + // TODO + Y_UNUSED(maxByteSize); + + return EventsQ_.Pop(block); + } + + TMaybe GetEvent(const NYdb::NTopic::TReadSessionGetEventSettings& settings) override { + return GetEvent(settings.Block_, settings.MaxByteSize_); + } + + bool Close(TDuration timeout = TDuration::Max()) override { + Y_UNUSED(timeout); + // TOOD send TSessionClosedEvent + EventsQ_.Stop(); + Pool_.Stop(); + + if (FilePoller_.joinable()) { + FilePoller_.join(); + } + return true; + } + + NYdb::NTopic::TReaderCounters::TPtr GetCounters() const override { + return Counters_; + } + + TString GetSessionId() const override { + return ToString(Session_->GetPartitionSessionId()); + } + + ~TFileTopicReadSession() { + EventsQ_.Stop(); + Pool_.Stop(); + if (FilePoller_.joinable()) { + FilePoller_.join(); + } + } + +private: + using TMessageInformation = NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessageInformation; + using TMessage = 
NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage; + + TMessageInformation MakeNextMessageInformation(size_t offset, size_t uncompressedSize, const TString& messageGroupId = "") { + auto now = TInstant::Now(); + TMessageInformation msgInfo( + offset, + ProducerId_, + SeqNo_, + now, + now, + MakeIntrusive(), + MakeIntrusive(), + uncompressedSize, + messageGroupId + ); + return msgInfo; + } + + TMessage MakeNextMessage(const TString& msgBuff) { + TMessage msg(msgBuff, nullptr, MakeNextMessageInformation(MsgOffset_, msgBuff.size()), Session_); + return msg; + } + + void PollFileForChanges() { + TFileInput fi(File_); + while (!EventsQ_.IsStopped()) { + TString rawMsg; + TVector msgs; + size_t size = 0; + ui64 maxBatchRowSize = 100; + + while (size_t read = fi.ReadLine(rawMsg)) { + msgs.emplace_back(MakeNextMessage(rawMsg)); + MsgOffset_++; + if (!maxBatchRowSize--) { + break; + } + size += rawMsg.size(); + } + if (!msgs.empty()) { + EventsQ_.Push(NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent(msgs, {}, Session_), size); + } + + Sleep(FILE_POLL_PERIOD); + } + } + + TFile File_; + TBlockingEQueue EventsQ_ {4_MB}; + NYdb::NTopic::TPartitionSession::TPtr Session_; + TString ProducerId_; + std::thread FilePoller_; + NYdb::NTopic::TReaderCounters::TPtr Counters_; + + TThreadPool Pool_; + size_t MsgOffset_ = 0; + ui64 SeqNo_ = 0; +}; + +struct TDummyPartitionSession: public NYdb::NTopic::TPartitionSession { + TDummyPartitionSession(ui64 sessionId, const TString& topicPath, ui64 partId) { + PartitionSessionId = sessionId; + TopicPath = topicPath; + PartitionId = partId; + } + + void RequestStatus() override { + // TODO send TPartitionSessionStatusEvent + } +}; + +std::shared_ptr TFileTopicClient::CreateReadSession(const NYdb::NTopic::TReadSessionSettings& settings) { + Y_ENSURE(!settings.Topics_.empty()); + TString topicPath = settings.Topics_.front().Path_; + + auto topicsIt = Topics_.find(make_pair("pq", topicPath)); + Y_ENSURE(topicsIt != Topics_.end()); + 
auto filePath = topicsIt->second.FilePath; + Y_ENSURE(filePath); + + // TODO + ui64 sessionId = 0; + ui64 partitionId = 0; + + return std::make_shared( + TFile(*filePath, EOpenMode::TEnum::RdOnly), + MakeIntrusive(sessionId, topicPath, partitionId) + ); +} + +NYdb::TAsyncStatus TFileTopicClient::CreateTopic(const TString& path, const NYdb::NTopic::TCreateTopicSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(settings); + return NThreading::MakeFuture(NYdb::TStatus(NYdb::EStatus::SUCCESS, {})); +} + +NYdb::TAsyncStatus TFileTopicClient::AlterTopic(const TString& path, const NYdb::NTopic::TAlterTopicSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(settings); + return NThreading::MakeFuture(NYdb::TStatus(NYdb::EStatus::SUCCESS, {})); +} + +NYdb::TAsyncStatus TFileTopicClient::DropTopic(const TString& path, const NYdb::NTopic::TDropTopicSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(settings); + return NThreading::MakeFuture(NYdb::TStatus(NYdb::EStatus::SUCCESS, {})); +} + +NYdb::NTopic::TAsyncDescribeTopicResult TFileTopicClient::DescribeTopic(const TString& path, + const NYdb::NTopic::TDescribeTopicSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(settings); + + NYdb::TStatus success(NYdb::EStatus::SUCCESS, {}); + return NThreading::MakeFuture(NYdb::NTopic::TDescribeTopicResult(std::move(success), {})); +} + +NYdb::NTopic::TAsyncDescribeConsumerResult TFileTopicClient::DescribeConsumer(const TString& path, const TString& consumer, + const NYdb::NTopic::TDescribeConsumerSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(consumer); + Y_UNUSED(settings); + + NYdb::TStatus success(NYdb::EStatus::SUCCESS, {}); + return NThreading::MakeFuture(NYdb::NTopic::TDescribeConsumerResult(std::move(success), {})); +} + +NYdb::NTopic::TAsyncDescribePartitionResult TFileTopicClient::DescribePartition(const TString& path, i64 partitionId, + const NYdb::NTopic::TDescribePartitionSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(partitionId); + Y_UNUSED(settings); + + 
NYdb::TStatus success(NYdb::EStatus::SUCCESS, {}); + return NThreading::MakeFuture(NYdb::NTopic::TDescribePartitionResult(std::move(success), {})); +} + +std::shared_ptr TFileTopicClient::CreateSimpleBlockingWriteSession( + const NYdb::NTopic::TWriteSessionSettings& settings) { + Y_UNUSED(settings); + return nullptr; +} + +std::shared_ptr TFileTopicClient::CreateWriteSession(const NYdb::NTopic::TWriteSessionSettings& settings) { + Y_UNUSED(settings); + return nullptr; +} + +NYdb::TAsyncStatus TFileTopicClient::CommitOffset(const TString& path, ui64 partitionId, const TString& consumerName, ui64 offset, + const NYdb::NTopic::TCommitOffsetSettings& settings) { + Y_UNUSED(path); + Y_UNUSED(partitionId); + Y_UNUSED(consumerName); + Y_UNUSED(offset); + Y_UNUSED(settings); + return NThreading::MakeFuture(NYdb::TStatus(NYdb::EStatus::SUCCESS, {})); +} + +} diff --git a/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.h b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.h new file mode 100644 index 000000000000..f80426726159 --- /dev/null +++ b/ydb/library/yql/providers/pq/gateway/dummy/yql_pq_file_topic_client.h @@ -0,0 +1,36 @@ +#include "yql_pq_dummy_gateway.h" + +#include + +namespace NYql { +struct TFileTopicClient : public ITopicClient { + TFileTopicClient(THashMap topics): Topics_(topics) {} + + NYdb::TAsyncStatus CreateTopic(const TString& path, const NYdb::NTopic::TCreateTopicSettings& settings = {}) override; + + NYdb::TAsyncStatus AlterTopic(const TString& path, const NYdb::NTopic::TAlterTopicSettings& settings = {}) override; + + NYdb::TAsyncStatus DropTopic(const TString& path, const NYdb::NTopic::TDropTopicSettings& settings = {}) override; + + NYdb::NTopic::TAsyncDescribeTopicResult DescribeTopic(const TString& path, + const NYdb::NTopic::TDescribeTopicSettings& settings = {}) override; + + NYdb::NTopic::TAsyncDescribeConsumerResult DescribeConsumer(const TString& path, const TString& consumer, + const 
NYdb::NTopic::TDescribeConsumerSettings& settings = {}) override; + + NYdb::NTopic::TAsyncDescribePartitionResult DescribePartition(const TString& path, i64 partitionId, + const NYdb::NTopic::TDescribePartitionSettings& settings = {}) override; + + std::shared_ptr CreateReadSession(const NYdb::NTopic::TReadSessionSettings& settings) override; + + std::shared_ptr CreateSimpleBlockingWriteSession( + const NYdb::NTopic::TWriteSessionSettings& settings) override; + std::shared_ptr CreateWriteSession(const NYdb::NTopic::TWriteSessionSettings& settings) override; + + NYdb::TAsyncStatus CommitOffset(const TString& path, ui64 partitionId, const TString& consumerName, ui64 offset, + const NYdb::NTopic::TCommitOffsetSettings& settings = {}) override; + +private: + THashMap Topics_; +}; +} \ No newline at end of file diff --git a/ydb/library/yql/providers/pq/gateway/native/yql_pq_gateway.cpp b/ydb/library/yql/providers/pq/gateway/native/yql_pq_gateway.cpp index cdd6d3c09a6d..d23e69e03c02 100644 --- a/ydb/library/yql/providers/pq/gateway/native/yql_pq_gateway.cpp +++ b/ydb/library/yql/providers/pq/gateway/native/yql_pq_gateway.cpp @@ -40,6 +40,8 @@ class TPqNativeGateway : public IPqGateway { const TString& database, bool secure) override; + ITopicClient::TPtr GetTopicClient(const NYdb::TDriver& driver, const NYdb::NTopic::TTopicClientSettings& settings) override; + private: void InitClusterConfigs(); TPqSession::TPtr GetExistingSession(const TString& sessionId) const; @@ -138,6 +140,10 @@ IPqGateway::TPtr CreatePqNativeGateway(const TPqGatewayServices& services) { return MakeIntrusive(services); } +ITopicClient::TPtr TPqNativeGateway::GetTopicClient(const NYdb::TDriver& driver, const NYdb::NTopic::TTopicClientSettings& settings = NYdb::NTopic::TTopicClientSettings()) { + return MakeIntrusive(driver, settings); +} + TPqNativeGateway::~TPqNativeGateway() { Sessions.clear(); } diff --git a/ydb/library/yql/providers/pq/provider/ut/ya.make 
b/ydb/library/yql/providers/pq/provider/ut/ya.make new file mode 100644 index 000000000000..0fcfe0b8dc17 --- /dev/null +++ b/ydb/library/yql/providers/pq/provider/ut/ya.make @@ -0,0 +1,41 @@ +UNITTEST_FOR(ydb/library/yql/providers/pq/provider) + +SRCS( + yql_pq_ut.cpp +) + +PEERDIR( + library/cpp/lwtrace + library/cpp/lwtrace/mon + ydb/library/actors/wilson/protos + ydb/library/yql/core/facade + ydb/library/yql/core/file_storage + ydb/library/yql/core/services/mounts + ydb/library/yql/dq/comp_nodes + ydb/library/yql/dq/transform + ydb/library/yql/minikql/comp_nodes/llvm14 + ydb/library/yql/providers/common/comp_nodes + ydb/library/yql/providers/common/db_id_async_resolver + ydb/library/yql/providers/dq/local_gateway + ydb/library/yql/providers/dq/provider + ydb/library/yql/providers/pq/async_io + ydb/library/yql/providers/pq/gateway/dummy + ydb/library/yql/providers/pq/provider + ydb/library/yql/providers/solomon/gateway + ydb/library/yql/providers/solomon/provider + ydb/library/yql/public/udf/service/exception_policy + ydb/library/yql/sql/pg_dummy + ydb/public/sdk/cpp/client/ydb_params + ydb/public/sdk/cpp/client/ydb_persqueue_public/codecs +) + +YQL_LAST_ABI_VERSION() + +IF (SANITIZER_TYPE OR WITH_VALGRIND) + SIZE(LARGE) + TAG(ya:fat) +ELSE() + SIZE(MEDIUM) +ENDIF() + +END() diff --git a/ydb/library/yql/providers/pq/provider/ut/yql_pq_ut.cpp b/ydb/library/yql/providers/pq/provider/ut/yql_pq_ut.cpp new file mode 100644 index 000000000000..a84c91473980 --- /dev/null +++ b/ydb/library/yql/providers/pq/provider/ut/yql_pq_ut.cpp @@ -0,0 +1,231 @@ +#include + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace NYql { + +NDq::IDqAsyncIoFactory::TPtr CreateAsyncIoFactory(const NYdb::TDriver& driver, const IPqGateway::TPtr& 
pqGateway) { + auto factory = MakeIntrusive(); + RegisterDqPqReadActorFactory(*factory, driver, nullptr, pqGateway); + + RegisterDqPqWriteActorFactory(*factory, driver, nullptr); + return factory; +} + +bool RunPqProgram( + const TString& code, + bool optimizeOnly, + bool printExpr = false, + bool printTrace = false, + TString* errorsMessage = nullptr) { + NLog::YqlLoggerScope logger("cerr", false); + NLog::YqlLogger().SetComponentLevel(NLog::EComponent::Core, NLog::ELevel::DEBUG); + NLog::YqlLogger().SetComponentLevel(NLog::EComponent::ProviderRtmr, NLog::ELevel::DEBUG); + + IOutputStream* errorsOutput = &Cerr; + TMaybe errorsMessageOutput; + TMaybe tee; + if (errorsMessage) { + errorsMessageOutput.ConstructInPlace(*errorsMessage); + tee.ConstructInPlace(&*errorsMessageOutput, &Cerr); + errorsOutput = &*tee; + } + + // Gateways config. + TGatewaysConfig gatewaysConfig; + // pq + { + auto& pqClusterConfig = *gatewaysConfig.MutablePq()->MutableClusterMapping()->Add(); + pqClusterConfig.SetName("lb"); + pqClusterConfig.SetClusterType(NYql::TPqClusterConfig::CT_PERS_QUEUE); + pqClusterConfig.SetEndpoint("lb.ru"); + pqClusterConfig.SetConfigManagerEndpoint("cm.lb.ru"); + pqClusterConfig.SetTvmId(777); + } + + // solomon + { + auto& solomonClusterConfig = *gatewaysConfig.MutableSolomon()->MutableClusterMapping()->Add(); + solomonClusterConfig.SetName("sol"); + solomonClusterConfig.SetCluster("sol.ru"); + } + + // dq + { + auto& dqCfg = *gatewaysConfig.MutableDq(); + auto* setting = dqCfg.AddDefaultSettings(); + setting->SetName("EnableComputeActor"); + setting->SetValue("1"); + } + + auto functionRegistry = NKikimr::NMiniKQL::CreateFunctionRegistry(NKikimr::NMiniKQL::CreateBuiltinRegistry())->Clone(); + TVector dataProvidersInit; + + // pq + auto pqGateway = MakeIntrusive(); + pqGateway->AddDummyTopic(TDummyTopic("lb", "my_in_topic")); + pqGateway->AddDummyTopic(TDummyTopic("lb", "my_out_topic")); + 
dataProvidersInit.push_back(GetPqDataProviderInitializer(std::move(pqGateway))); + + // solomon + auto solomonGateway = CreateSolomonGateway(gatewaysConfig.GetSolomon()); + dataProvidersInit.push_back(GetSolomonDataProviderInitializer(std::move(solomonGateway))); + + // dq + auto dqCompFactory = NKikimr::NMiniKQL::GetCompositeWithBuiltinFactory({ + NYql::GetCommonDqFactory(), + NKikimr::NMiniKQL::GetYqlFactory() + }); + + auto dqTaskTransformFactory = NYql::CreateCompositeTaskTransformFactory({ + NYql::CreateCommonDqTaskTransformFactory() + }); + + const auto driverConfig = NYdb::TDriverConfig().SetLog(CreateLogBackend("cerr")); + NYdb::TDriver driver(driverConfig); + auto dqGateway = CreateLocalDqGateway(functionRegistry.Get(), dqCompFactory, dqTaskTransformFactory, {}, false/*spilling*/, CreateAsyncIoFactory(driver, pqGateway)); + + auto storage = NYql::CreateAsyncFileStorage({}); + dataProvidersInit.push_back(NYql::GetDqDataProviderInitializer(&CreateDqExecTransformer, dqGateway, dqCompFactory, {}, storage)); + + TExprContext moduleCtx; + IModuleResolver::TPtr moduleResolver; + YQL_ENSURE(GetYqlDefaultModuleResolver(moduleCtx, moduleResolver)); + + TProgramFactory factory(true, functionRegistry.Get(), 0ULL, dataProvidersInit, "ut"); + + factory.SetGatewaysConfig(&gatewaysConfig); + factory.SetModules(moduleResolver); + + TProgramPtr program = factory.Create("program", code); + program->ConfigureYsonResultFormat(NYson::EYsonFormat::Text); + + Cerr << "Parse SQL..." 
<< Endl; + NSQLTranslation::TTranslationSettings sqlSettings; + sqlSettings.SyntaxVersion = 1; + sqlSettings.V0Behavior = NSQLTranslation::EV0Behavior::Disable; + sqlSettings.Flags.insert("DqEngineEnable"); + sqlSettings.Flags.insert("DqEngineForce"); + + sqlSettings.ClusterMapping["lb"] = PqProviderName; + sqlSettings.ClusterMapping["sol"] = SolomonProviderName; + if (!program->ParseSql(sqlSettings)) { + program->PrintErrorsTo(*errorsOutput); + return false; + } + program->AstRoot()->PrettyPrintTo(Cerr, NYql::TAstPrintFlags::PerLine | NYql::TAstPrintFlags::ShortQuote); + + + Cerr << "Compile..." << Endl; + if (!program->Compile("user")) { + program->PrintErrorsTo(*errorsOutput); + return false; + } + + auto exprOut = printExpr ? &Cout : nullptr; + auto traceOpt = printTrace ? &Cerr : nullptr; + + TProgram::TStatus status = TProgram::TStatus::Error; + if (optimizeOnly) { + Cerr << "Optimize..." << Endl; + status = program->Optimize("user", traceOpt, nullptr, exprOut); + } else { + Cerr << "Run..." << Endl; + status = program->Run("user", traceOpt, nullptr, exprOut); + } + + if (status == TProgram::TStatus::Error) { + if (printTrace) { + program->Print(traceOpt, nullptr); + } + program->PrintErrorsTo(*errorsOutput); + return false; + } + + driver.Stop(true); + + Cerr << "Done." 
<< Endl; + return true; +} + +Y_UNIT_TEST_SUITE(YqlPqSimpleTests) { + + Y_UNIT_TEST(SelectWithNoSchema) { + auto code = R"( +USE lb; +PRAGMA pq.Consumer="my_test_consumer"; +INSERT INTO my_out_topic +SELECT Data FROM my_in_topic WHERE Data < "100"; + )"; + TString errorMessage; + auto res = RunPqProgram(code, true, true, true, &errorMessage); + UNIT_ASSERT_C(res, errorMessage); + } + + Y_UNIT_TEST(SelectWithSchema) { + auto code = R"( +USE lb; +PRAGMA pq.Consumer="my_test_consumer"; + +INSERT INTO my_out_topic +SELECT CAST(y as string) || x FROM lb.object(my_in_topic, "json") WITH SCHEMA (Int32 as y, String as x) + )"; + TString errorMessage; + auto res = RunPqProgram(code, true, true, true, &errorMessage); + UNIT_ASSERT_C(res, errorMessage); + } + + Y_UNIT_TEST(SelectStarWithSchema) { + auto code = R"( +USE lb; +PRAGMA pq.Consumer="my_test_consumer"; + +$q = SELECT * FROM lb.object(my_in_topic, "json") WITH SCHEMA (Int32 as y, String as x); +INSERT INTO my_out_topic +SELECT x FROM $q + )"; + TString errorMessage; + auto res = RunPqProgram(code, true, true, true, &errorMessage); + UNIT_ASSERT_C(res, errorMessage); + } + +} + +} // NYql diff --git a/ydb/library/yql/providers/pq/provider/ya.make b/ydb/library/yql/providers/pq/provider/ya.make index be8405e07576..92186b68b755 100644 --- a/ydb/library/yql/providers/pq/provider/ya.make +++ b/ydb/library/yql/providers/pq/provider/ya.make @@ -54,3 +54,7 @@ PEERDIR( YQL_LAST_ABI_VERSION() END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp index cd171c8dd446..ea93ce37449a 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp @@ -132,7 +132,7 @@ class TPqDataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { } TStatus HandleDqTopicSource(TExprBase input, TExprContext& ctx) 
{ - if (!EnsureArgsCount(input.Ref(), 6, ctx)) { + if (!EnsureArgsCount(input.Ref(), 5, ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp index bac0ba92fbc7..3a305edaf59d 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp @@ -124,17 +124,9 @@ class TPqDqIntegration: public TDqIntegrationBase { const auto token = "cluster:default_" + clusterName; - auto rowSchema = pqReadTopic.Topic().RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); - TExprNode::TListType colTypes; - const auto& typeItems = rowSchema->GetItems(); - colTypes.reserve(typeItems.size()); - const auto pos = read->Pos(); // TODO - std::transform(typeItems.cbegin(), typeItems.cend(), std::back_inserter(colTypes), - [&](const TItemExprType* item) { - return ctx.NewAtom(pos, FormatType(item->GetItemType())); - }); - auto columnTypes = ctx.NewList(pos, std::move(colTypes)); - + const auto& typeItems = pqReadTopic.Topic().RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast()->GetItems(); + const auto pos = read->Pos(); + TExprNode::TListType colNames; colNames.reserve(typeItems.size()); std::transform(typeItems.cbegin(), typeItems.cend(), std::back_inserter(colNames), @@ -142,7 +134,7 @@ class TPqDqIntegration: public TDqIntegrationBase { return ctx.NewAtom(pos, item->GetName()); }); auto columnNames = ctx.NewList(pos, std::move(colNames)); - + auto row = Build(ctx, read->Pos()) .Name("row") .Done(); @@ -153,7 +145,6 @@ class TPqDqIntegration: public TDqIntegrationBase { .Build() .Done().Ptr(); - return Build(ctx, read->Pos()) .Input() .Topic(pqReadTopic.Topic()) @@ -163,7 +154,6 @@ class TPqDqIntegration: public TDqIntegrationBase { .Name().Build(token) .Build() .FilterPredicate(emptyPredicate) - .ColumnTypes(std::move(columnTypes)) .Build() .RowType(ExpandType(pqReadTopic.Pos(), 
*rowType, ctx)) .DataSource(pqReadTopic.DataSource().Cast()) @@ -263,27 +253,27 @@ class TPqDqIntegration: public TDqIntegrationBase { srcDesc.AddMetadataFields(metadata.Value().Maybe().Cast().StringValue()); } - for (const auto& column : topicSource.Columns().Cast()) { - srcDesc.AddColumns(column.StringValue()); + const auto rowSchema = topic.RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); + for (const auto& item : rowSchema->GetItems()) { + srcDesc.AddColumns(TString(item->GetName())); + srcDesc.AddColumnTypes(FormatType(item->GetItemType())); } - for (const auto& columnTypes : topicSource.ColumnTypes().Cast()) { - srcDesc.AddColumnTypes(columnTypes.StringValue()); - } - NYql::NConnector::NApi::TPredicate predicateProto; if (auto predicate = topicSource.FilterPredicate(); !NYql::IsEmptyFilterPredicate(predicate)) { TStringBuilder err; if (!NYql::SerializeFilterPredicate(predicate, &predicateProto, err)) { - ythrow yexception() << "Failed to serialize filter predicate for source: " << err; + ctx.AddWarning(TIssue(ctx.GetPosition(node.Pos()), "Failed to serialize filter predicate for source: " + err)); + predicateProto.Clear(); } } - //sharedReading = true; - sharedReading = sharedReading && (format == "json_each_row"); + sharedReading = sharedReading && (format == "json_each_row" || format == "raw"); TString predicateSql = NYql::FormatWhere(predicateProto); if (sharedReading) { - srcDesc.SetPredicate(predicateSql); + if (format == "json_each_row") { + srcDesc.SetPredicate(predicateSql); + } srcDesc.SetSharedReading(true); } protoSettings.PackFrom(srcDesc); diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_gateway.h b/ydb/library/yql/providers/pq/provider/yql_pq_gateway.h index f46931022f27..0c324f17b39a 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_gateway.h +++ b/ydb/library/yql/providers/pq/provider/yql_pq_gateway.h @@ -1,4 +1,7 @@ #pragma once + +#include "yql_pq_topic_client.h" + #include #include #include @@ -25,6 +28,8 @@ struct 
IPqGateway : public TThrRefBase { // DS API. virtual NThreading::TFuture ListStreams(const TString& sessionId, const TString& cluster, const TString& database, const TString& token, ui32 limit, const TString& exclusiveStartStreamName = {}) = 0; + + virtual ITopicClient::TPtr GetTopicClient(const NYdb::TDriver& driver, const NYdb::NTopic::TTopicClientSettings& settings) = 0; virtual void UpdateClusterConfigs( const TString& clusterName, diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp index 92964948185a..2f75ccb70e61 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_logical_opt.cpp @@ -1,6 +1,7 @@ #include "yql_pq_provider_impl.h" #include +#include #include #include #include @@ -30,22 +31,20 @@ namespace { } }; -std::unordered_set GetUsedMetadataFields(const TCoExtractMembers& extract) { - std::unordered_set usedMetadataFields; - for (const auto extractMember : extract.Members()) { - if (FindPqMetaFieldDescriptorBySysColumn(extractMember.StringValue())) { - usedMetadataFields.emplace(extractMember.StringValue()); - } +std::unordered_set GetUsedColumnNames(const TCoExtractMembers& extractMembers) { + std::unordered_set usedColumnNames; + for (const auto& member : extractMembers.Members()) { + usedColumnNames.emplace(member.StringValue()); } - return usedMetadataFields; + return usedColumnNames; } -TVector DropUnusedMetadata(const TPqTopic& pqTopic, const std::unordered_set& usedMetadataFields) { +TVector DropUnusedMetadata(const TPqTopic& pqTopic, const std::unordered_set& usedColumnNames) { TVector newSourceMetadata; for (auto metadataItem : pqTopic.Metadata()) { auto metadataName = metadataItem.Cast().Value().Maybe().Cast().StringValue(); - if (usedMetadataFields.contains(metadataName)) { + if (FindPqMetaFieldDescriptorBySysColumn(metadataName) && usedColumnNames.contains(metadataName)) { 
newSourceMetadata.push_back(metadataItem); } } @@ -88,10 +87,10 @@ TCoNameValueTupleList DropUnusedMetadataFromDqWrapSettings( .Done(); } -TExprNode::TPtr DropUnusedMetadataFieldsFromRowType( +TExprNode::TPtr DropUnusedRowItems( TPositionHandle position, const TStructExprType* oldRowType, - const std::unordered_set& usedMetadataFields, + const std::unordered_set& usedColumnNames, TExprContext& ctx) { TVector newFields; @@ -99,7 +98,7 @@ TExprNode::TPtr DropUnusedMetadataFieldsFromRowType( for (auto itemExprType : oldRowType->GetItems()) { const auto columnName = TString(itemExprType->GetName()); - if (FindPqMetaFieldDescriptorBySysColumn(columnName) && !usedMetadataFields.contains(columnName)) { + if (!usedColumnNames.contains(columnName)) { continue; } @@ -109,14 +108,14 @@ TExprNode::TPtr DropUnusedMetadataFieldsFromRowType( return ExpandType(position, *ctx.MakeType(newFields), ctx); } -TExprNode::TPtr DropUnusedMetadataFieldsFromColumns( +TExprNode::TPtr DropUnusedColumns( TExprBase oldColumns, - const std::unordered_set& usedMetadataFields, + const std::unordered_set& usedColumnNames, TExprContext& ctx) { TExprNode::TListType res; for (const auto& column : oldColumns.Cast()) { - if (FindPqMetaFieldDescriptorBySysColumn(column.StringValue()) && !usedMetadataFields.contains(column.StringValue())) { + if (!usedColumnNames.contains(column.StringValue())) { continue; } @@ -160,57 +159,68 @@ class TPqLogicalOptProposalTransformer : public TOptimizeTransformerBase { }*/ TMaybeNode ExtractMembersOverDqWrap(TExprBase node, TExprContext& ctx) const { - const auto& extract = node.Cast(); - const auto& input = extract.Input(); - const auto dqSourceWrap = input.Maybe(); - const auto dqPqTopicSource = dqSourceWrap.Input().Maybe(); - const auto pqTopic = dqPqTopicSource.Topic().Maybe(); - if (!pqTopic) { + const auto& extractMembers = node.Cast(); + const auto& extractMembersInput = extractMembers.Input(); + const auto& maybeDqSourceWrap = extractMembersInput.Maybe(); + if 
(!maybeDqSourceWrap) { + return node; + } + + const auto& dqSourceWrap = maybeDqSourceWrap.Cast(); + if (dqSourceWrap.DataSource().Category() != PqProviderName) { + return node; + } + + const auto& maybeDqPqTopicSource = dqSourceWrap.Input().Maybe(); + if (!maybeDqPqTopicSource) { return node; } - const auto usedMetadataFields = GetUsedMetadataFields(extract); - const auto newSourceMetadata = DropUnusedMetadata(pqTopic.Cast(), usedMetadataFields); - if (newSourceMetadata.size() == pqTopic.Metadata().Cast().Size()) { + const auto& dqPqTopicSource = maybeDqPqTopicSource.Cast(); + const auto& pqTopic = dqPqTopicSource.Topic(); + + auto usedColumnNames = GetUsedColumnNames(extractMembers); + const TStructExprType* inputRowType = pqTopic.RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); + const TStructExprType* outputRowType = node.Ref().GetTypeAnn()->Cast()->GetItemType()->Cast(); + if (outputRowType->GetSize() == 0 && inputRowType->GetSize() > 0) { + auto item = GetLightColumn(*inputRowType); + YQL_ENSURE(item); + YQL_ENSURE(usedColumnNames.insert(TString(item->GetName())).second); + } + + const auto oldRowType = pqTopic.Ref().GetTypeAnn()->Cast()->GetItemType()->Cast(); + if (oldRowType->GetSize() == usedColumnNames.size()) { return node; } - const auto oldRowType = pqTopic.Ref().GetTypeAnn() - ->Cast()->GetItemType()->Cast(); + const auto& newSourceMetadata = DropUnusedMetadata(pqTopic, usedColumnNames); - auto newPqTopicSource = Build(ctx, node.Pos()) - .InitFrom(dqPqTopicSource.Cast()) + const TExprNode::TPtr newPqTopicSource = Build(ctx, dqPqTopicSource.Pos()) + .InitFrom(dqPqTopicSource) .Topic() - .InitFrom(pqTopic.Cast()) + .InitFrom(pqTopic) .Metadata().Add(newSourceMetadata).Build() - .Build(); - - if (dqPqTopicSource.Columns()) { - auto newColumns = DropUnusedMetadataFieldsFromColumns( - dqPqTopicSource.Columns().Cast(), - usedMetadataFields, - ctx); - newPqTopicSource.Columns(newColumns); - } + .RowSpec(DropUnusedRowItems(pqTopic.RowSpec().Pos(), 
inputRowType, usedColumnNames, ctx)) + .Build() + .Columns(DropUnusedColumns(dqPqTopicSource.Columns(), usedColumnNames, ctx)) + .Done() + .Ptr(); - const auto newDqSourceWrap = Build(ctx, node.Pos()) - .InitFrom(dqSourceWrap.Cast()) - .Input(newPqTopicSource.Done()) - .Settings(DropUnusedMetadataFromDqWrapSettings( - dqSourceWrap.Cast(), - newSourceMetadata, - ctx)) - .RowType(DropUnusedMetadataFieldsFromRowType( - node.Pos(), - oldRowType, - usedMetadataFields, - ctx)) + const TExprNode::TPtr newDqSourceWrap = Build(ctx, dqSourceWrap.Pos()) + .InitFrom(dqSourceWrap) + .Input(newPqTopicSource) + .Settings(DropUnusedMetadataFromDqWrapSettings(dqSourceWrap, newSourceMetadata, ctx)) + .RowType(DropUnusedRowItems(dqSourceWrap.RowType().Pos(), oldRowType, usedColumnNames, ctx)) .Done() .Ptr(); + if (outputRowType->GetSize() == usedColumnNames.size()) { + return newDqSourceWrap; + } + return Build(ctx, node.Pos()) - .InitFrom(extract) - .Input(ctx.ReplaceNode(input.Ptr(), dqSourceWrap.Ref(), newDqSourceWrap)) + .InitFrom(extractMembers) + .Input(ctx.ReplaceNode(extractMembersInput.Ptr(), dqSourceWrap.Ref(), newDqSourceWrap)) .Done(); } @@ -247,7 +257,6 @@ class TPqLogicalOptProposalTransformer : public TOptimizeTransformerBase { auto newFilterLambda = MakePushdownPredicate(flatmap.Lambda(), ctx, node.Pos(), TPushdownSettings()); if (!newFilterLambda) { - ctx.AddWarning(TIssue(ctx.GetPosition(node.Pos()), "No predicate to pushdown")); return node; } YQL_CLOG(INFO, ProviderPq) << "Build new TCoFlatMap with predicate"; diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_topic_client.h b/ydb/library/yql/providers/pq/provider/yql_pq_topic_client.h new file mode 100644 index 000000000000..2c463b9598af --- /dev/null +++ b/ydb/library/yql/providers/pq/provider/yql_pq_topic_client.h @@ -0,0 +1,89 @@ +#pragma once +#include + +namespace NYql { +class ITopicClient : public TThrRefBase { +public: + using TPtr = TIntrusivePtr; + + virtual NYdb::TAsyncStatus CreateTopic(const 
TString& path, const NYdb::NTopic::TCreateTopicSettings& settings = {}) = 0; + + virtual NYdb::TAsyncStatus AlterTopic(const TString& path, const NYdb::NTopic::TAlterTopicSettings& settings = {}) = 0; + + virtual NYdb::TAsyncStatus DropTopic(const TString& path, const NYdb::NTopic::TDropTopicSettings& settings = {}) = 0; + + virtual NYdb::NTopic::TAsyncDescribeTopicResult DescribeTopic(const TString& path, + const NYdb::NTopic::TDescribeTopicSettings& settings = {}) = 0; + + virtual NYdb::NTopic::TAsyncDescribeConsumerResult DescribeConsumer(const TString& path, const TString& consumer, + const NYdb::NTopic::TDescribeConsumerSettings& settings = {}) = 0; + + virtual NYdb::NTopic::TAsyncDescribePartitionResult DescribePartition(const TString& path, i64 partitionId, + const NYdb::NTopic::TDescribePartitionSettings& settings = {}) = 0; + + virtual std::shared_ptr CreateReadSession(const NYdb::NTopic::TReadSessionSettings& settings) = 0; + + virtual std::shared_ptr CreateSimpleBlockingWriteSession( + const NYdb::NTopic::TWriteSessionSettings& settings) = 0; + virtual std::shared_ptr CreateWriteSession(const NYdb::NTopic::TWriteSessionSettings& settings) = 0; + + virtual NYdb::TAsyncStatus CommitOffset(const TString& path, ui64 partitionId, const TString& consumerName, ui64 offset, + const NYdb::NTopic::TCommitOffsetSettings& settings = {}) = 0; +}; + +class TNativeTopicClient : public ITopicClient { +public: + TNativeTopicClient(const NYdb::TDriver& driver, const NYdb::NTopic::TTopicClientSettings& settings = {}): + Driver_(driver), Client_(Driver_, settings) {} + + NYdb::TAsyncStatus CreateTopic(const TString& path, const NYdb::NTopic::TCreateTopicSettings& settings = {}) override { + return Client_.CreateTopic(path, settings); + } + + NYdb::TAsyncStatus AlterTopic(const TString& path, const NYdb::NTopic::TAlterTopicSettings& settings = {}) override { + return Client_.AlterTopic(path, settings); + } + + NYdb::TAsyncStatus DropTopic(const TString& path, const 
NYdb::NTopic::TDropTopicSettings& settings = {}) override { + return Client_.DropTopic(path, settings); + } + + NYdb::NTopic::TAsyncDescribeTopicResult DescribeTopic(const TString& path, + const NYdb::NTopic::TDescribeTopicSettings& settings = {}) override { + return Client_.DescribeTopic(path, settings); + } + + NYdb::NTopic::TAsyncDescribeConsumerResult DescribeConsumer(const TString& path, const TString& consumer, + const NYdb::NTopic::TDescribeConsumerSettings& settings = {}) override { + return Client_.DescribeConsumer(path, consumer, settings); + } + + NYdb::NTopic::TAsyncDescribePartitionResult DescribePartition(const TString& path, i64 partitionId, + const NYdb::NTopic::TDescribePartitionSettings& settings = {}) override { + return Client_.DescribePartition(path, partitionId, settings); + } + + std::shared_ptr CreateReadSession(const NYdb::NTopic::TReadSessionSettings& settings) override { + return Client_.CreateReadSession(settings); + } + + std::shared_ptr CreateSimpleBlockingWriteSession( + const NYdb::NTopic::TWriteSessionSettings& settings) override { + return Client_.CreateSimpleBlockingWriteSession(settings); + } + + std::shared_ptr CreateWriteSession(const NYdb::NTopic::TWriteSessionSettings& settings) override { + return Client_.CreateWriteSession(settings); + } + + NYdb::TAsyncStatus CommitOffset(const TString& path, ui64 partitionId, const TString& consumerName, ui64 offset, + const NYdb::NTopic::TCommitOffsetSettings& settings = {}) override { + return Client_.CommitOffset(path, partitionId, consumerName, offset, settings); + } + + ~TNativeTopicClient() {} +private: + NYdb::TDriver Driver_; + NYdb::NTopic::TTopicClient Client_; +}; +} \ No newline at end of file diff --git a/ydb/library/yql/tools/dq/worker_node/main.cpp b/ydb/library/yql/tools/dq/worker_node/main.cpp index 9f7c1667f0bb..ef73247ed20a 100644 --- a/ydb/library/yql/tools/dq/worker_node/main.cpp +++ b/ydb/library/yql/tools/dq/worker_node/main.cpp @@ -23,6 +23,7 @@ #include #include 
#include +#include #include #include #include @@ -106,7 +107,16 @@ class TConcurrentInvokerFactory: public ITaskRunnerInvokerFactory { NDq::IDqAsyncIoFactory::TPtr CreateAsyncIoFactory(const NYdb::TDriver& driver, IHTTPGateway::TPtr httpGateway) { auto factory = MakeIntrusive(); - RegisterDqPqReadActorFactory(*factory, driver, nullptr); + + TPqGatewayServices pqServices( + driver, + nullptr, + nullptr, + std::make_shared(), + nullptr + ); + RegisterDqPqReadActorFactory(*factory, driver, nullptr, CreatePqNativeGateway(std::move(pqServices))); + RegisterYdbReadActorFactory(*factory, driver, nullptr); RegisterClickHouseReadActorFactory(*factory, nullptr, httpGateway); RegisterDqPqWriteActorFactory(*factory, driver, nullptr); diff --git a/ydb/library/yql/tools/dq/worker_node/ya.make b/ydb/library/yql/tools/dq/worker_node/ya.make index 6fbbddbd1b45..00863fb73f64 100644 --- a/ydb/library/yql/tools/dq/worker_node/ya.make +++ b/ydb/library/yql/tools/dq/worker_node/ya.make @@ -19,6 +19,7 @@ IF (NOT OS_WINDOWS) ydb/library/yql/providers/dq/stats_collector ydb/library/yql/providers/dq/task_runner ydb/library/yql/providers/pq/async_io + ydb/library/yql/providers/pq/gateway/native ydb/library/yql/providers/pq/proto ydb/library/yql/providers/s3/actors ydb/library/yql/providers/ydb/actors diff --git a/ydb/library/yql/tools/dqrun/dqrun.cpp b/ydb/library/yql/tools/dqrun/dqrun.cpp index 6d0f621f6e6c..5438406208ac 100644 --- a/ydb/library/yql/tools/dqrun/dqrun.cpp +++ b/ydb/library/yql/tools/dqrun/dqrun.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -74,8 +75,12 @@ #include #include +#include #include #include +#include +#include + #include #include @@ -167,6 +172,17 @@ void ReadGatewaysConfig(const TString& configFile, TGatewaysConfig* config, THas } } +void ReadFqConfig(const TString& fqCfgFile, NFq::NConfig::TConfig* fqConfig) { + if (fqCfgFile.empty()) { + return; + } + auto configData = TFileInput(fqCfgFile).ReadAll(); + using 
::google::protobuf::TextFormat; + if (!TextFormat::ParseFromString(configData, fqConfig)) { + ythrow yexception() << "Bad format of fq configuration"; + } +} + void PatchGatewaysConfig(TGatewaysConfig* config, const TString& mrJobBin, const TString& mrJobUdfsDir, size_t numThreads, bool keepTemp) { @@ -269,13 +285,14 @@ NDq::IDqAsyncIoFactory::TPtr CreateAsyncIoFactory( ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, NKikimr::NMiniKQL::IFunctionRegistry& functionRegistry, size_t HTTPmaxTimeSeconds, - size_t maxRetriesCount) { + size_t maxRetriesCount, + IPqGateway::TPtr pqGateway) { auto factory = MakeIntrusive(); RegisterDqInputTransformLookupActorFactory(*factory); if (ytFileServices) { RegisterYtLookupActorFactory(*factory, ytFileServices, functionRegistry); } - RegisterDqPqReadActorFactory(*factory, driver, nullptr); + RegisterDqPqReadActorFactory(*factory, driver, nullptr, pqGateway); RegisterYdbReadActorFactory(*factory, driver, nullptr); RegisterDQSolomonReadActorFactory(*factory, nullptr); RegisterClickHouseReadActorFactory(*factory, nullptr, httpGateway); @@ -444,9 +461,34 @@ int RunProgram(TProgramPtr program, const TRunOptions& options, const THashMap>& additionalLocalServices) { + if (fqConfig.HasRowDispatcher() && fqConfig.GetRowDispatcher().GetEnabled()) { + NFq::IYqSharedResources::TPtr iSharedResources = NFq::CreateYqSharedResources( + fqConfig, + NKikimr::CreateYdbCredentialsProviderFactory, + MakeIntrusive()); + NFq::TYqSharedResources::TPtr yqSharedResources = NFq::TYqSharedResources::Cast(iSharedResources); + ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory; + + auto rowDispatcher = NFq::NewRowDispatcherService( + fqConfig.GetRowDispatcher(), + NKikimr::CreateYdbCredentialsProviderFactory, + yqSharedResources, + credentialsFactory, + "/tenant", + MakeIntrusive(), + pqGateway); + + additionalLocalServices.emplace_back( + NFq::RowDispatcherServiceActorId(), + TActorSetupCmd(rowDispatcher.release(), 
TMailboxType::Simple, 0)); + } +} + int RunMain(int argc, const char* argv[]) { TString gatewaysCfgFile; + TString fqCfgFile; TString progFile; TVector tablesMappingList; THashMap tablesMapping; @@ -507,6 +549,7 @@ int RunMain(int argc, const char* argv[]) TString opId; IQStoragePtr qStorage; TQContext qContext; + TVector pqFileList; NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default(); opts.AddLongOption('p', "program", "Program to execute (use '-' to read from stdin)") @@ -535,6 +578,10 @@ int RunMain(int argc, const char* argv[]) .Optional() .RequiredArgument("FILE") .StoreResult(&gatewaysCfgFile); + opts.AddLongOption("fq-cfg", "federated query configuration file") + .Optional() + .RequiredArgument("FILE") + .StoreResult(&fqCfgFile); opts.AddLongOption("fs-cfg", "Path to file storage config") .Optional() .StoreResult(&fileStorageCfg); @@ -641,6 +688,7 @@ int RunMain(int argc, const char* argv[]) opts.AddLongOption("no-force-dq", "don't set force dq mode").Optional().NoArgument().SetFlag(&runOptions.NoForceDq); opts.AddLongOption("ansi-lexer", "Use ansi lexer").Optional().NoArgument().SetFlag(&runOptions.AnsiLexer); opts.AddLongOption('E', "emulate-yt", "Emulate YT tables").Optional().NoArgument().SetFlag(&emulateYt); + opts.AddLongOption("emulate-pq", "Emulate YDS with local file, accepts list of tables to emulate with following format: topic@filePath").Optional().AppendTo(&pqFileList); opts.AddLongOption("qstorage-dir", "directory for QStorage").StoreResult(&qStorageDir).DefaultValue("."); opts.AddLongOption("op-id", "QStorage operation id").StoreResult(&opId).DefaultValue("dummy_op"); opts.AddLongOption("capture", "write query metadata to QStorage").NoArgument(); @@ -791,6 +839,9 @@ int RunMain(int argc, const char* argv[]) setting->SetValue("1"); } + NFq::NConfig::TConfig fqConfig; + ReadFqConfig(fqCfgFile, &fqConfig); + if (res.Has("enable-spilling")) { auto* setting = gatewaysConfig.MutableDq()->AddDefaultSettings(); setting->SetName("SpillingEngine"); 
@@ -912,6 +963,7 @@ int RunMain(int argc, const char* argv[]) dataProvidersInit.push_back(GetS3DataProviderInitializer(httpGateway, nullptr, true, actorSystemManager->GetActorSystem())); } + IPqGateway::TPtr pqGateway; if (gatewaysConfig.HasPq()) { TPqGatewayServices pqServices( driver, @@ -920,7 +972,25 @@ int RunMain(int argc, const char* argv[]) std::make_shared(gatewaysConfig.GetPq()), funcRegistry.Get() ); - auto pqGateway = CreatePqNativeGateway(pqServices); + bool emulatePq = !pqFileList.empty(); + + if (emulatePq) { + auto fileGateway = MakeIntrusive(); + + for (auto& s : pqFileList) { + TStringBuf topicName, filePath; + TStringBuf(s).Split('@', topicName, filePath); + if (topicName.empty() || filePath.empty()) { + Cerr << "Incorrect table mapping, expected form topic@file" << Endl; + return 1; + } + fileGateway->AddDummyTopic(TDummyTopic("pq", TString(topicName), TString(filePath))); + } + pqGateway = std::move(fileGateway); + } else { + pqGateway = CreatePqNativeGateway(pqServices); + } + for (auto& cluster: gatewaysConfig.GetPq().GetClusterMapping()) { clusters.emplace(to_lower(cluster.GetName()), TString{PqProviderName}); } @@ -937,6 +1007,8 @@ int RunMain(int argc, const char* argv[]) clusters.emplace(to_lower(cluster.GetName()), TString{NYql::SolomonProviderName}); } } + TVector> additionalLocalServices; + InitFq(fqConfig, pqGateway, additionalLocalServices); std::function metricsPusherFactory = {}; @@ -960,8 +1032,8 @@ int RunMain(int argc, const char* argv[]) bool enableSpilling = res.Has("enable-spilling"); dqGateway = CreateLocalDqGateway(funcRegistry.Get(), dqCompFactory, dqTaskTransformFactory, dqTaskPreprocessorFactories, enableSpilling, - CreateAsyncIoFactory(driver, httpGateway, ytFileServices, genericClient, credentialsFactory, *funcRegistry, requestTimeout, maxRetries), threads, - metricsRegistry, metricsPusherFactory); + CreateAsyncIoFactory(driver, httpGateway, ytFileServices, genericClient, credentialsFactory, *funcRegistry, 
requestTimeout, maxRetries, pqGateway), threads, + metricsRegistry, metricsPusherFactory, std::move(additionalLocalServices)); } dataProvidersInit.push_back(GetDqDataProviderInitializer(&CreateDqExecTransformer, dqGateway, dqCompFactory, {}, storage)); diff --git a/ydb/library/yql/tools/dqrun/examples/fq.conf b/ydb/library/yql/tools/dqrun/examples/fq.conf new file mode 100644 index 000000000000..6dee04c46e01 --- /dev/null +++ b/ydb/library/yql/tools/dqrun/examples/fq.conf @@ -0,0 +1,21 @@ +RowDispatcher { + Enabled: true + TimeoutBeforeStartSessionSec: 2 + MaxSessionUsedMemory: 0 + SendStatusPeriodSec: 10 + WithoutConsumer: true + Coordinator { + CoordinationNodePath: "not_used" + Database { + Endpoint: "not_used:2135" + Database: "/not/used/database" + UseLocalMetadataService: true + UseSsl: true + ClientTimeoutSec: 70 + OperationTimeoutSec: 60 + CancelAfterSec: 60 + } + LocalMode: true + } +} + diff --git a/ydb/library/yql/tools/dqrun/examples/gateways.conf b/ydb/library/yql/tools/dqrun/examples/gateways.conf index e3c699301ed0..caa399a8eb99 100644 --- a/ydb/library/yql/tools/dqrun/examples/gateways.conf +++ b/ydb/library/yql/tools/dqrun/examples/gateways.conf @@ -68,6 +68,10 @@ Dq { Name: "EnableDqReplicate" Value: "true" } + DefaultSettings { + Name: "_TableTimeout" + Value: "600000" + } } Generic { @@ -133,3 +137,14 @@ SqlCore { TranslationFlags: ["FlexibleTypes", "DisableAnsiOptionalAs", "EmitAggApply"] } +Pq { + ClusterMapping { + Name: "pq" + Endpoint: "localhost:2135" + Database: "local" + ClusterType: CT_DATA_STREAMS + UseSsl: True + SharedReading:False + } +} + diff --git a/ydb/library/yql/tools/dqrun/ya.make b/ydb/library/yql/tools/dqrun/ya.make index 8a19b6aee64a..10e8cdc3ed06 100644 --- a/ydb/library/yql/tools/dqrun/ya.make +++ b/ydb/library/yql/tools/dqrun/ya.make @@ -53,6 +53,7 @@ ENDIF() ydb/library/yql/providers/dq/provider ydb/library/yql/providers/dq/provider/exec ydb/library/yql/providers/pq/async_io + ydb/library/yql/providers/pq/gateway/dummy 
ydb/library/yql/providers/pq/gateway/native ydb/library/yql/providers/pq/provider ydb/library/yql/providers/s3/actors @@ -92,7 +93,7 @@ ENDIF() ydb/library/yql/utils/actor_system ydb/core/fq/libs/actors ydb/core/fq/libs/db_id_async_resolver_impl - + ydb/core/fq/libs/init ydb/library/yql/udfs/common/clickhouse/client ) diff --git a/ydb/library/yql/tools/mrrun/mrrun.cpp b/ydb/library/yql/tools/mrrun/mrrun.cpp index 8f13c8f63555..6353ca5de4e3 100644 --- a/ydb/library/yql/tools/mrrun/mrrun.cpp +++ b/ydb/library/yql/tools/mrrun/mrrun.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -234,7 +235,16 @@ bool FillUsedUrls( NDq::IDqAsyncIoFactory::TPtr CreateAsyncIoFactory(const NYdb::TDriver& driver, IHTTPGateway::TPtr httpGateway) { auto factory = MakeIntrusive(); - RegisterDqPqReadActorFactory(*factory, driver, nullptr); + + TPqGatewayServices pqServices( + driver, + nullptr, + nullptr, + std::make_shared(), + nullptr + ); + RegisterDqPqReadActorFactory(*factory, driver, nullptr, CreatePqNativeGateway(pqServices)); + RegisterDqPqReadActorFactory(*factory, driver, nullptr, CreatePqNativeGateway(std::move(pqServices))); RegisterYdbReadActorFactory(*factory, driver, nullptr); RegisterClickHouseReadActorFactory(*factory, nullptr, httpGateway); RegisterDqPqWriteActorFactory(*factory, driver, nullptr); diff --git a/ydb/library/yql/tools/mrrun/ya.make b/ydb/library/yql/tools/mrrun/ya.make index d37afeb6487b..f81ede32e404 100644 --- a/ydb/library/yql/tools/mrrun/ya.make +++ b/ydb/library/yql/tools/mrrun/ya.make @@ -49,6 +49,7 @@ PEERDIR( ydb/library/yql/providers/dq/provider ydb/library/yql/providers/dq/provider/exec ydb/library/yql/providers/pq/async_io + ydb/library/yql/providers/pq/gateway/native ydb/library/yql/providers/s3/actors ydb/library/yql/providers/ydb/actors ydb/library/yql/providers/ydb/comp_nodes diff --git a/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp index 
350c0bd5b40c..be8c7e128e22 100644 --- a/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp +++ b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include @@ -22,7 +24,7 @@ struct TFixture : public TPqIoTestFixture { } void InitRdSource( - NYql::NPq::NProto::TDqPqTopicSource&& settings, + const NYql::NPq::NProto::TDqPqTopicSource& settings, i64 freeSpace = 1_MB) { CaSetup->Execute([&](TFakeActor& actor) { @@ -38,8 +40,9 @@ struct TFixture : public TPqIoTestFixture { const THashMap secureParams; const THashMap taskParams { {"pq", serializedParams} }; + NYql::NPq::NProto::TDqPqTopicSource copySettings = settings; auto [dqSource, dqSourceAsActor] = CreateDqPqRdReadActor( - std::move(settings), + std::move(copySettings), 0, NYql::NDq::TCollectStatsLevel::None, "query_1", @@ -50,7 +53,8 @@ struct TFixture : public TPqIoTestFixture { LocalRowDispatcherId, actor.GetHolderFactory(), MakeIntrusive(), - freeSpace); + freeSpace + ); actor.InitAsyncInput(dqSource, dqSourceAsActor); }); @@ -190,9 +194,8 @@ struct TFixture : public TPqIoTestFixture { }); } - - void StartSession() { - InitRdSource(BuildPqTopicSourceSettings("topicName")); + void StartSession(NYql::NPq::NProto::TDqPqTopicSource& settings, i64 freeSpace = 1_MB) { + InitRdSource(settings, freeSpace); SourceRead(UVParser); ExpectCoordinatorChangesSubscribe(); @@ -204,13 +207,14 @@ struct TFixture : public TPqIoTestFixture { MockAck(RowDispatcher1); } - void ProcessSomeJsons(ui64 offset, const std::vector& jsons, NActors::TActorId rowDispatcherId) { + void ProcessSomeJsons(ui64 offset, const std::vector& jsons, NActors::TActorId rowDispatcherId, + std::function(const NUdf::TUnboxedValue&)> uvParser = UVParser) { MockNewDataArrived(rowDispatcherId); ExpectGetNextBatch(rowDispatcherId); MockMessageBatch(offset, jsons, rowDispatcherId); - auto result = SourceReadDataUntil(UVParser, jsons.size()); + auto result = SourceReadDataUntil(uvParser, jsons.size()); 
AssertDataWithWatermarks(result, jsons, {}); } @@ -219,6 +223,8 @@ struct TFixture : public TPqIoTestFixture { const TString Json3 = "{\"dt\":300,\"value\":\"value3\"}"; const TString Json4 = "{\"dt\":400,\"value\":\"value4\"}"; + NYql::NPq::NProto::TDqPqTopicSource Source1 = BuildPqTopicSourceSettings("topicName"); + NActors::TActorId LocalRowDispatcherId; NActors::TActorId Coordinator1Id; NActors::TActorId Coordinator2Id; @@ -228,12 +234,12 @@ struct TFixture : public TPqIoTestFixture { Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { Y_UNIT_TEST_F(TestReadFromTopic, TFixture) { - StartSession(); + StartSession(Source1); ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); } Y_UNIT_TEST_F(SessionError, TFixture) { - StartSession(); + StartSession(Source1); TInstant deadline = Now() + TDuration::Seconds(5); auto future = CaSetup->AsyncInputPromises.FatalError.GetFuture(); @@ -252,7 +258,7 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { } Y_UNIT_TEST_F(ReadWithFreeSpace, TFixture) { - StartSession(); + StartSession(Source1); MockNewDataArrived(RowDispatcher1); ExpectGetNextBatch(RowDispatcher1); @@ -275,7 +281,7 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { { TFixture f; - f.StartSession(); + f.StartSession(f.Source1); f.ProcessSomeJsons(0, {f.Json1, f.Json2}, f.RowDispatcher1); // offsets: 0, 1 f.SaveSourceState(CreateCheckpoint(), state); @@ -283,7 +289,7 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { } { TFixture f; - f.InitRdSource(BuildPqTopicSourceSettings("topicName")); + f.InitRdSource(f.Source1); f.SourceRead(UVParser); f.LoadSource(state); f.SourceRead(UVParser); @@ -303,7 +309,7 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { } { TFixture f; - f.InitRdSource(BuildPqTopicSourceSettings("topicName")); + f.InitRdSource(f.Source1); f.SourceRead(UVParser); f.LoadSource(state); f.SourceRead(UVParser); @@ -321,7 +327,7 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { } Y_UNIT_TEST_F(CoordinatorChanged, TFixture) { - StartSession(); + StartSession(Source1); 
ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); MockMessageBatch(2, {Json3}, RowDispatcher1); @@ -341,8 +347,32 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { ProcessSomeJsons(3, {Json4}, RowDispatcher2); } + Y_UNIT_TEST_F(Backpressure, TFixture) { + StartSession(Source1, 2_KB); + + TString json(900, 'c'); + ProcessSomeJsons(0, {json}, RowDispatcher1); + + MockNewDataArrived(RowDispatcher1); + ExpectGetNextBatch(RowDispatcher1); + MockMessageBatch(0, {json, json, json}, RowDispatcher1); + + MockNewDataArrived(RowDispatcher1); + ASSERT_THROW( + CaSetup->Runtime->GrabEdgeEvent(RowDispatcher1, TDuration::Seconds(0)), + NActors::TEmptyEventQueueException); + + auto result = SourceReadDataUntil(UVParser, 3); + AssertDataWithWatermarks(result, {json, json, json}, {}); + ExpectGetNextBatch(RowDispatcher1); + + MockMessageBatch(3, {Json1}, RowDispatcher1); + result = SourceReadDataUntil(UVParser, 1); + AssertDataWithWatermarks(result, {Json1}, {}); + } + Y_UNIT_TEST_F(RowDispatcherIsRestarted, TFixture) { - StartSession(); + StartSession(Source1); ProcessSomeJsons(0, {Json1, Json2}, RowDispatcher1); MockDisconnected(); MockConnected(); @@ -355,5 +385,18 @@ Y_UNIT_TEST_SUITE(TDqPqRdReadActorTests) { ProcessSomeJsons(2, {Json3}, RowDispatcher1); } + + Y_UNIT_TEST_F(IgnoreMessageIfNoSessions, TFixture) { + StartSession(Source1); + MockCoordinatorChanged(Coordinator2Id); + MockSessionError(); + } + + Y_UNIT_TEST_F(MetadataFields, TFixture) { + auto source = BuildPqTopicSourceSettings("topicName"); + source.AddMetadataFields("_yql_sys_create_time"); + StartSession(source); + ProcessSomeJsons(0, {Json1}, RowDispatcher1, UVParserWithMetadatafields); + } } } // NYql::NDq diff --git a/ydb/tests/fq/pq_async_io/ut/ya.make b/ydb/tests/fq/pq_async_io/ut/ya.make index 82f2450a647d..8d1e90145cc9 100644 --- a/ydb/tests/fq/pq_async_io/ut/ya.make +++ b/ydb/tests/fq/pq_async_io/ut/ya.make @@ -16,6 +16,7 @@ PEERDIR( ydb/library/yql/minikql/computation/llvm14 
ydb/library/yql/providers/common/comp_nodes ydb/library/yql/providers/common/ut_helpers + ydb/library/yql/providers/pq/gateway/native ydb/library/yql/public/udf/service/exception_policy ydb/library/yql/sql ydb/public/sdk/cpp/client/ydb_datastreams diff --git a/ydb/tests/fq/pq_async_io/ut_helpers.cpp b/ydb/tests/fq/pq_async_io/ut_helpers.cpp index 907638e9fa1d..f74d16bfc492 100644 --- a/ydb/tests/fq/pq_async_io/ut_helpers.cpp +++ b/ydb/tests/fq/pq_async_io/ut_helpers.cpp @@ -1,6 +1,7 @@ #include "ut_helpers.h" #include +#include #include @@ -71,6 +72,14 @@ void TPqIoTestFixture::InitSource( const THashMap secureParams; const THashMap taskParams { {"pq", serializedParams} }; + TPqGatewayServices pqServices( + Driver, + nullptr, + nullptr, + std::make_shared(), + nullptr + ); + auto [dqSource, dqSourceAsActor] = CreateDqPqReadActor( std::move(settings), 0, @@ -84,6 +93,7 @@ void TPqIoTestFixture::InitSource( actor.SelfId(), actor.GetHolderFactory(), MakeIntrusive(), + CreatePqNativeGateway(std::move(pqServices)), freeSpace); actor.InitAsyncInput(dqSource, dqSourceAsActor); @@ -236,6 +246,12 @@ std::vector UVParser(const NUdf::TUnboxedValue& item) { return { TString(item.AsStringRef()) }; } +std::vector UVParserWithMetadatafields(const NUdf::TUnboxedValue& item) { + const auto& cell = item.GetElement(0); + TString str(cell.AsStringRef()); + return {str}; +} + void TPqIoTestFixture::AsyncOutputWrite(std::vector data, TMaybe checkpoint) { CaSetup->AsyncOutputWrite([data](NKikimr::NMiniKQL::THolderFactory& factory) { NKikimr::NMiniKQL::TUnboxedValueBatch batch; diff --git a/ydb/tests/fq/pq_async_io/ut_helpers.h b/ydb/tests/fq/pq_async_io/ut_helpers.h index 6e9f92007d2b..3f5446cdd0ae 100644 --- a/ydb/tests/fq/pq_async_io/ut_helpers.h +++ b/ydb/tests/fq/pq_async_io/ut_helpers.h @@ -126,5 +126,6 @@ void AddReadRule( const TString& streamName); std::vector UVParser(const NUdf::TUnboxedValue& item); +std::vector UVParserWithMetadatafields(const NUdf::TUnboxedValue& item); } 
diff --git a/ydb/tests/fq/pq_async_io/ya.make b/ydb/tests/fq/pq_async_io/ya.make index c27e93ce4ce6..b00c15a7a6ed 100644 --- a/ydb/tests/fq/pq_async_io/ya.make +++ b/ydb/tests/fq/pq_async_io/ya.make @@ -14,3 +14,7 @@ PEERDIR( YQL_LAST_ABI_VERSION() END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/ydb/tests/fq/yds/test_row_dispatcher.py b/ydb/tests/fq/yds/test_row_dispatcher.py index 7cafe48661cc..7d59d333d620 100644 --- a/ydb/tests/fq/yds/test_row_dispatcher.py +++ b/ydb/tests/fq/yds/test_row_dispatcher.py @@ -83,29 +83,40 @@ def wait_row_dispatcher_sensor_value(kikimr, sensor, expected_count, exact_match class TestPqRowDispatcher(TestYdsBase): @yq_v1 - def test_read_raw_format_without_row_dispatcher(self, kikimr, client): + def test_read_raw_format_with_row_dispatcher(self, kikimr, client): client.create_yds_connection( YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True ) - self.init_topics("test_read_raw_format_without_row_dispatcher", create_output=False) + connections = client.list_connections(fq.Acl.Visibility.PRIVATE).result.connection + assert len(connections) == 1 + assert connections[0].content.setting.data_streams.shared_reading is True + self.init_topics("test_read_raw_format_without_row_dispatcher", create_output=False) output_topic = "pq_test_pq_read_write_output" - create_stream(output_topic, partitions_count=1) create_read_rule(output_topic, self.consumer_name) - sql = Rf''' - INSERT INTO {YDS_CONNECTION}.`{output_topic}` - SELECT * FROM {YDS_CONNECTION}.`{self.input_topic}`;''' + sql1 = Rf'''INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT * FROM {YDS_CONNECTION}.`{self.input_topic}` WITH (format=raw, SCHEMA (data String NOT NULL));''' - query_id = start_yds_query(kikimr, client, sql) + query_id = start_yds_query(kikimr, client, sql1) data = ['{"time" = 101;}', '{"time" = 102;}'] self.write_stream(data) - expected = data - assert self.read_stream(len(expected), topic_path=output_topic) == expected + 
assert self.read_stream(len(data), topic_path=output_topic) == data + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + stop_yds_query(client, query_id) - wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + sql2 = Rf'''INSERT INTO {YDS_CONNECTION}.`{output_topic}` + SELECT * FROM {YDS_CONNECTION}.`{self.input_topic}` WITH (format=raw, SCHEMA (data String NOT NULL)) + WHERE data != "romashka";''' + + query_id = start_yds_query(kikimr, client, sql2) + data = ['{"time" = 103;}', '{"time" = 104;}'] + + self.write_stream(data) + assert self.read_stream(len(data), topic_path=output_topic) == data + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) stop_yds_query(client, query_id) @yq_v1 @@ -144,7 +155,6 @@ def test_simple_not_null(self, kikimr, client): wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) @yq_v1 - @pytest.mark.skip(reason="Is not implemented") def test_simple_optional(self, kikimr, client): client.create_yds_connection( YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True @@ -193,7 +203,7 @@ def test_scheme_error(self, kikimr, client): client.wait_query_status(query_id, fq.QueryMeta.FAILED) issues = str(client.describe_query(query_id).result.query.issue) - assert "Failed to unwrap empty optional" in issues, "Incorrect Issues: " + issues + assert "Cannot parse JSON string" in issues, "Incorrect Issues: " + issues wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 0) wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) @@ -207,6 +217,40 @@ def test_scheme_error(self, kikimr, client): stop_yds_query(client, query_id) wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 0) + @yq_v1 + def test_nested_types(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_nested_types") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT data FROM 
{YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, data Json NOT NULL, event String NOT NULL)) + WHERE event = "event1" or event = "event2";''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 101, "data": {"key": "value"}, "event": "event1"}', + '{"time": 102, "data": ["key1", "key2"], "event": "event2"}', + ] + + self.write_stream(data) + expected = [ + '{"key": "value"}', + '["key1", "key2"]' + ] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + stop_yds_query(client, query_id) + + issues = str(client.describe_query(query_id).result.query.transient_issue) + assert "Row dispatcher will use the predicate:" in issues, "Incorrect Issues: " + issues + @yq_v1 def test_filter(self, kikimr, client): client.create_yds_connection( @@ -243,6 +287,64 @@ def test_filter(self, kikimr, client): issues = str(client.describe_query(query_id).result.query.transient_issue) assert "Row dispatcher will use the predicate: WHERE (`time` > 101" in issues, "Incorrect Issues: " + issues + @yq_v1 + def test_filter_missing_fields(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_filter") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, data String, event String NOT NULL)) + WHERE data IS NULL;''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 101, "event": "event1"}', + '{"time": 102, "data": null, "event": "event2"}', + '{"time": 103, "data": "", "event": "event2"}', + '{"time": 104, "data": "null", "event": 
"event2"}', + ] + + self.write_stream(data) + expected = ['101', '102'] + assert self.read_stream(len(expected), topic_path=self.output_topic) == expected + + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + stop_yds_query(client, query_id) + + issues = str(client.describe_query(query_id).result.query.transient_issue) + assert "Row dispatcher will use the predicate:" in issues, "Incorrect Issues: " + issues + + @yq_v1 + def test_filter_use_unsupported_predicate(self, kikimr, client): + client.create_yds_connection( + YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True + ) + self.init_topics("test_filter_use_unsupported_predicate") + + sql = Rf''' + INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` + SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, data String NOT NULL, event String NOT NULL)) + WHERE event LIKE 'event2%';''' + + query_id = start_yds_query(kikimr, client, sql) + wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + + data = [ + '{"time": 102, "data": "hello2", "event": "event2"}', + ] + + self.write_stream(data) + assert self.read_stream(1, topic_path=self.output_topic) == ['102'] + wait_actor_count(kikimr, "DQ_PQ_READ_ACTOR", 1) + stop_yds_query(client, query_id) + @yq_v1 def test_filter_with_mr(self, kikimr, client): client.create_yds_connection( @@ -345,8 +447,8 @@ def test_start_new_query(self, kikimr, client): sql3 = Rf''' INSERT INTO {YDS_CONNECTION}.`{output_topic3}` - SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` - WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, data String NOT NULL));''' + SELECT event FROM {YDS_CONNECTION}.`{self.input_topic}` + WITH (format=json_each_row, SCHEMA (time Int32 NOT NULL, event String NOT NULL));''' query_id3 = start_yds_query(kikimr, client, sql3) data = [ @@ -355,11 +457,11 @@ def test_start_new_query(self, kikimr, client): ] self.write_stream(data) 
- expected = ['103', '104'] - - assert self.read_stream(len(expected), topic_path=output_topic1) == expected - assert self.read_stream(len(expected), topic_path=output_topic2) == expected - assert self.read_stream(len(expected), topic_path=output_topic3) == expected + expected12 = ['103', '104'] + expected3 = ['event3', 'event4'] + assert self.read_stream(len(expected), topic_path=output_topic1) == expected12 + assert self.read_stream(len(expected), topic_path=output_topic2) == expected12 + assert self.read_stream(len(expected), topic_path=output_topic3) == expected3 wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) @@ -430,9 +532,9 @@ def test_stop_start_with_filter(self, kikimr, client): client.create_yds_connection( YDS_CONNECTION, os.getenv("YDB_DATABASE"), os.getenv("YDB_ENDPOINT"), shared_reading=True ) - self.init_topics("test_stop_start", create_output=False) + self.init_topics("test_stop_start_with_filter", create_output=False) - output_topic = "test_stop_start" + output_topic = "test_stop_start_with_filter" create_stream(output_topic, partitions_count=1) create_read_rule(output_topic, self.consumer_name) From dddb51ecf60c92f66bb69e923d86d3b438e41835 Mon Sep 17 00:00:00 2001 From: Alexey Pozdniakov Date: Fri, 18 Oct 2024 09:11:05 +0300 Subject: [PATCH 42/56] [YQ-3761] Sync stable (#10543) --- ydb/library/yql/dq/opt/dq_opt_log.cpp | 38 ++++++------------- .../expr_nodes/yql_clickhouse_expr_nodes.json | 7 ++-- .../yql_clickhouse_datasource_type_ann.cpp | 6 ++- .../yql_clickhouse_dq_integration.cpp | 1 + .../expr_nodes/yql_generic_expr_nodes.json | 11 +++--- .../yql_generic_datasource_type_ann.cpp | 6 ++- .../provider/yql_generic_dq_integration.cpp | 1 + .../pq/expr_nodes/yql_pq_expr_nodes.json | 11 +++--- .../provider/yql_pq_datasource_type_ann.cpp | 7 +++- .../pq/provider/yql_pq_dq_integration.cpp | 1 + .../s3/expr_nodes/yql_s3_expr_nodes.json | 23 +++++------ .../provider/yql_s3_datasource_type_ann.cpp | 12 +++++- 
.../s3/provider/yql_s3_dq_integration.cpp | 2 + .../expr_nodes/yql_solomon_expr_nodes.json | 25 ++++++------ .../yql_solomon_datasource_type_ann.cpp | 6 ++- .../provider/yql_solomon_dq_integration.cpp | 1 + .../ydb/expr_nodes/yql_ydb_expr_nodes.json | 7 ++-- .../provider/yql_ydb_datasource_type_ann.cpp | 6 ++- .../ydb/provider/yql_ydb_dq_integration.cpp | 1 + .../extracted | 5 --- .../hybrid_file/part2/canondata/result.json | 12 +++--- .../opt.yql | 4 +- .../opt.yql | 4 +- .../opt.yql | 4 +- .../opt.yql | 4 +- .../extracted | 5 --- 26 files changed, 114 insertions(+), 96 deletions(-) delete mode 100644 ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted delete mode 100644 ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted diff --git a/ydb/library/yql/dq/opt/dq_opt_log.cpp b/ydb/library/yql/dq/opt/dq_opt_log.cpp index 467fec490c9a..c8969a768f06 100644 --- a/ydb/library/yql/dq/opt/dq_opt_log.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_log.cpp @@ -182,32 +182,18 @@ static void CollectSinkStages(const NNodes::TDqQuery& dqQuery, THashSet(); - - THashSet sinkStages; - CollectSinkStages(dqQuery, sinkStages); - TOptimizeExprSettings settings{nullptr}; - settings.VisitLambdas = false; - bool deletedDqQueryChild = false; - TExprNode::TPtr newDqQueryNode; - auto status = OptimizeExpr(dqQueryNode.Ptr(), newDqQueryNode, [&sinkStages, &deletedDqQueryChild](const TExprNode::TPtr& node, TExprContext& ctx) -> TExprNode::TPtr { - for (ui32 childIndex = 0; childIndex < node->ChildrenSize(); ++childIndex) { - TExprNode* child = node->Child(childIndex); - if (child->IsCallable(NNodes::TDqQuery::CallableName())) { - NNodes::TDqQuery dqQueryChild(child); - CollectSinkStages(dqQueryChild, sinkStages); - deletedDqQueryChild = true; - return ctx.ChangeChild(*node, childIndex, dqQueryChild.World().Ptr()); - } - } - return node; - }, ctx, settings); - YQL_ENSURE(status 
!= IGraphTransformer::TStatus::Error, "Failed to merge DqQuery nodes: " << status); - - if (deletedDqQueryChild) { - auto dqQueryBuilder = Build(ctx, dqQuery.Pos()); - dqQueryBuilder.World(newDqQueryNode->ChildPtr(TDqQuery::idx_World)); - + auto maybeDqQuery = dqQueryNode.Maybe(); + YQL_ENSURE(maybeDqQuery, "Expected DqQuery!"); + auto dqQuery = maybeDqQuery.Cast(); + + if (auto maybeDqQueryChild = dqQuery.World().Maybe()) { + auto dqQueryChild = maybeDqQueryChild.Cast(); + auto dqQueryBuilder = Build(ctx, dqQuery.Pos()) + .World(dqQueryChild.World()); + + THashSet sinkStages; + CollectSinkStages(dqQuery, sinkStages); + CollectSinkStages(maybeDqQueryChild.Cast(), sinkStages); auto sinkStagesBuilder = dqQueryBuilder.SinkStages(); for (const TExprNode::TPtr& stage : sinkStages) { sinkStagesBuilder.Add(stage); diff --git a/ydb/library/yql/providers/clickhouse/expr_nodes/yql_clickhouse_expr_nodes.json b/ydb/library/yql/providers/clickhouse/expr_nodes/yql_clickhouse_expr_nodes.json index a3378a39810d..358d2e112b8b 100644 --- a/ydb/library/yql/providers/clickhouse/expr_nodes/yql_clickhouse_expr_nodes.json +++ b/ydb/library/yql/providers/clickhouse/expr_nodes/yql_clickhouse_expr_nodes.json @@ -41,9 +41,10 @@ "Base": "TCallable", "Match": {"Type": "Callable", "Name": "ClSourceSettings"}, "Children": [ - {"Index": 0, "Name": "Table", "Type": "TCoAtom"}, - {"Index": 1, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 2, "Name": "Columns", "Type": "TCoAtomList"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "Table", "Type": "TCoAtom"}, + {"Index": 2, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 3, "Name": "Columns", "Type": "TCoAtomList"} ] } ] diff --git a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_datasource_type_ann.cpp b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_datasource_type_ann.cpp index fd18aa1286d4..7b3caf8cc043 100644 --- 
a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_datasource_type_ann.cpp @@ -25,7 +25,11 @@ class TClickHouseDataSourceTypeAnnotationTransformer : public TVisitorTransforme } TStatus HandleSourceSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureArgsCount(*input, 3U, ctx)) { + if (!EnsureArgsCount(*input, 4, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TClSourceSettings::idx_World), ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp index 0f997f1e4495..f9d370bdacbd 100644 --- a/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp +++ b/ydb/library/yql/providers/clickhouse/provider/yql_clickhouse_dq_integration.cpp @@ -52,6 +52,7 @@ class TClickHouseDqIntegration: public TDqIntegrationBase { return Build(ctx, read->Pos()) .Input() + .World(clReadTable.World()) .Table(clReadTable.Table()) .Token() .Name().Build(token) diff --git a/ydb/library/yql/providers/generic/expr_nodes/yql_generic_expr_nodes.json b/ydb/library/yql/providers/generic/expr_nodes/yql_generic_expr_nodes.json index 88e63d15833d..90652bc3037e 100644 --- a/ydb/library/yql/providers/generic/expr_nodes/yql_generic_expr_nodes.json +++ b/ydb/library/yql/providers/generic/expr_nodes/yql_generic_expr_nodes.json @@ -41,11 +41,12 @@ "Base": "TCallable", "Match": {"Type": "Callable", "Name": "GenSourceSettings"}, "Children": [ - {"Index": 0, "Name": "Cluster", "Type": "TCoAtom"}, - {"Index": 1, "Name": "Table", "Type": "TCoAtom"}, - {"Index": 2, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 3, "Name": "Columns", "Type": "TCoAtomList"}, - {"Index": 4, "Name": "FilterPredicate", "Type": "TCoLambda"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, 
"Name": "Cluster", "Type": "TCoAtom"}, + {"Index": 2, "Name": "Table", "Type": "TCoAtom"}, + {"Index": 3, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 4, "Name": "Columns", "Type": "TCoAtomList"}, + {"Index": 5, "Name": "FilterPredicate", "Type": "TCoLambda"} ] } ] diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp index d7606a16c423..9640ef5f94be 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_datasource_type_ann.cpp @@ -49,7 +49,11 @@ namespace NYql { } TStatus HandleSourceSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureArgsCount(*input, 5, ctx)) { + if (!EnsureArgsCount(*input, 6, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TGenSourceSettings::idx_World), ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp index ea093f871458..7bf32674c40e 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp +++ b/ydb/library/yql/providers/generic/provider/yql_generic_dq_integration.cpp @@ -85,6 +85,7 @@ namespace NYql { // clang-format off return Build(ctx, read->Pos()) .Input() + .World(genReadTable.World()) .Cluster(genReadTable.DataSource().Cluster()) .Table(genReadTable.Table()) .Token() diff --git a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json index 0b178695aaeb..d70cc37688f4 100644 --- a/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json +++ b/ydb/library/yql/providers/pq/expr_nodes/yql_pq_expr_nodes.json @@ -67,11 +67,12 @@ "Base": "TCallable", "Match": {"Type": "Callable", "Name": "DqPqTopicSource"}, "Children": [ 
- {"Index": 0, "Name": "Topic", "Type": "TPqTopic"}, - {"Index": 1, "Name": "Columns", "Type": "TExprBase"}, - {"Index": 2, "Name": "Settings", "Type": "TCoNameValueTupleList"}, - {"Index": 3, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 4, "Name": "FilterPredicate", "Type": "TCoLambda"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "Topic", "Type": "TPqTopic"}, + {"Index": 2, "Name": "Columns", "Type": "TExprBase"}, + {"Index": 3, "Name": "Settings", "Type": "TCoNameValueTupleList"}, + {"Index": 4, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 5, "Name": "FilterPredicate", "Type": "TCoLambda"} ] }, { diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp index ea93ce37449a..5dff9c61584a 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_datasource_type_ann.cpp @@ -132,11 +132,16 @@ class TPqDataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { } TStatus HandleDqTopicSource(TExprBase input, TExprContext& ctx) { - if (!EnsureArgsCount(input.Ref(), 5, ctx)) { + if (!EnsureArgsCount(input.Ref(), 6, ctx)) { return TStatus::Error; } TDqPqTopicSource topicSource = input.Cast(); + + if (!EnsureWorldType(topicSource.World().Ref(), ctx)) { + return TStatus::Error; + } + TPqTopic topic = topicSource.Topic(); if (!EnsureCallable(topic.Ref(), ctx)) { diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp index 3a305edaf59d..530bda256dc0 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp @@ -147,6 +147,7 @@ class TPqDqIntegration: public TDqIntegrationBase { return Build(ctx, read->Pos()) .Input() + .World(pqReadTopic.World()) 
.Topic(pqReadTopic.Topic()) .Columns(std::move(columnNames)) .Settings(BuildTopicReadSettings(clusterName, dqSettings, read->Pos(), format, ctx)) diff --git a/ydb/library/yql/providers/s3/expr_nodes/yql_s3_expr_nodes.json b/ydb/library/yql/providers/s3/expr_nodes/yql_s3_expr_nodes.json index 631e07dd3912..f7121ceaf650 100644 --- a/ydb/library/yql/providers/s3/expr_nodes/yql_s3_expr_nodes.json +++ b/ydb/library/yql/providers/s3/expr_nodes/yql_s3_expr_nodes.json @@ -45,10 +45,11 @@ "Base": "TCallable", "Match": {"Type": "CallableBase"}, "Children": [ - {"Index": 0, "Name": "Paths", "Type": "TS3Paths"}, - {"Index": 1, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 2, "Name": "RowsLimitHint", "Type": "TCoAtom"}, - {"Index": 3, "Name": "Path", "Type": "TCoAtom"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "Paths", "Type": "TS3Paths"}, + {"Index": 2, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 3, "Name": "RowsLimitHint", "Type": "TCoAtom"}, + {"Index": 4, "Name": "Path", "Type": "TCoAtom"} ] }, { @@ -56,9 +57,9 @@ "Base": "TS3SourceSettingsBase", "Match": {"Type": "Callable", "Name": "S3SourceSettings"}, "Children": [ - {"Index": 4, "Name": "SizeLimit", "Type": "TExprBase", "Optional": true}, - {"Index": 5, "Name": "PathPattern", "Type": "TExprBase", "Optional": true}, - {"Index": 6, "Name": "PathPatternVariant", "Type": "TExprBase", "Optional": true} + {"Index": 5, "Name": "SizeLimit", "Type": "TExprBase", "Optional": true}, + {"Index": 6, "Name": "PathPattern", "Type": "TExprBase", "Optional": true}, + {"Index": 7, "Name": "PathPatternVariant", "Type": "TExprBase", "Optional": true} ] }, { @@ -66,10 +67,10 @@ "Base": "TS3SourceSettingsBase", "Match": {"Type": "Callable", "Name": "S3ParseSettings"}, "Children": [ - {"Index": 4, "Name": "Format", "Type": "TCoAtom"}, - {"Index": 5, "Name": "RowType", "Type": "TExprBase"}, - {"Index": 6, "Name": "FilterPredicate", "Type": "TCoLambda"}, - {"Index": 7, "Name": "Settings", 
"Type": "TExprBase", "Optional": true} + {"Index": 5, "Name": "Format", "Type": "TCoAtom"}, + {"Index": 6, "Name": "RowType", "Type": "TExprBase"}, + {"Index": 7, "Name": "FilterPredicate", "Type": "TCoLambda"}, + {"Index": 8, "Name": "Settings", "Type": "TExprBase", "Optional": true} ] }, { diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp index 6c9be1472d0d..35fdeadde733 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_datasource_type_ann.cpp @@ -303,7 +303,11 @@ class TS3DataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { } TStatus HandleS3SourceSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureMinArgsCount(*input, 4U, ctx)) { + if (!EnsureMinMaxArgsCount(*input, 5, 8, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TS3SourceSettings::idx_World), ctx)) { return TStatus::Error; } @@ -335,7 +339,11 @@ class TS3DataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { } TStatus HandleS3ParseSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureMinMaxArgsCount(*input, 7U, 8U, ctx)) { + if (!EnsureMinMaxArgsCount(*input, 8, 9, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TS3ParseSettings::idx_World), ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp index d377e84ae8ca..9ab184da57b1 100644 --- a/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp +++ b/ydb/library/yql/providers/s3/provider/yql_s3_dq_integration.cpp @@ -286,6 +286,7 @@ class TS3DqIntegration: public TDqIntegrationBase { if (const auto useCoro = State_->Configuration->SourceCoroActor.Get(); (!useCoro || *useCoro) && format != "raw" && format != 
"json_list") { return Build(ctx, read->Pos()) .Input() + .World(s3ReadObject.World()) .Paths(s3ReadObject.Object().Paths()) .Token() .Name().Build(token) @@ -331,6 +332,7 @@ class TS3DqIntegration: public TDqIntegrationBase { auto emptyNode = Build(ctx, read->Pos()).Done().Ptr(); return Build(ctx, read->Pos()) .Input() + .World(s3ReadObject.World()) .Paths(s3ReadObject.Object().Paths()) .Token() .Name().Build(token) diff --git a/ydb/library/yql/providers/solomon/expr_nodes/yql_solomon_expr_nodes.json b/ydb/library/yql/providers/solomon/expr_nodes/yql_solomon_expr_nodes.json index 05798f70cbbd..fe28e1d7acc6 100644 --- a/ydb/library/yql/providers/solomon/expr_nodes/yql_solomon_expr_nodes.json +++ b/ydb/library/yql/providers/solomon/expr_nodes/yql_solomon_expr_nodes.json @@ -48,18 +48,19 @@ "Base": "TCallable", "Match": {"Type": "Callable", "Name": "SoSourceSettings"}, "Children": [ - {"Index": 0, "Name": "Project", "Type": "TCoAtom"}, - {"Index": 1, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 2, "Name": "RowType", "Type": "TExprBase"}, - {"Index": 3, "Name": "SystemColumns", "Type": "TCoAtomList"}, - {"Index": 4, "Name": "LabelNames", "Type": "TCoAtomList"}, - {"Index": 5, "Name": "From", "Type": "TCoAtom"}, - {"Index": 6, "Name": "To", "Type": "TCoAtom"}, - {"Index": 7, "Name": "Program", "Type": "TCoAtom"}, - {"Index": 8, "Name": "DownsamplingDisabled", "Type": "TCoBool"}, - {"Index": 9, "Name": "DownsamplingAggregation", "Type": "TCoAtom"}, - {"Index": 10, "Name": "DownsamplingFill", "Type": "TCoAtom"}, - {"Index": 11, "Name": "DownsamplingGridSec", "Type": "TCoUint32"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "Project", "Type": "TCoAtom"}, + {"Index": 2, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 3, "Name": "RowType", "Type": "TExprBase"}, + {"Index": 4, "Name": "SystemColumns", "Type": "TCoAtomList"}, + {"Index": 5, "Name": "LabelNames", "Type": "TCoAtomList"}, + {"Index": 6, "Name": "From", "Type": 
"TCoAtom"}, + {"Index": 7, "Name": "To", "Type": "TCoAtom"}, + {"Index": 8, "Name": "Program", "Type": "TCoAtom"}, + {"Index": 9, "Name": "DownsamplingDisabled", "Type": "TCoBool"}, + {"Index": 10, "Name": "DownsamplingAggregation", "Type": "TCoAtom"}, + {"Index": 11, "Name": "DownsamplingFill", "Type": "TCoAtom"}, + {"Index": 12, "Name": "DownsamplingGridSec", "Type": "TCoUint32"} ] }, { diff --git a/ydb/library/yql/providers/solomon/provider/yql_solomon_datasource_type_ann.cpp b/ydb/library/yql/providers/solomon/provider/yql_solomon_datasource_type_ann.cpp index adbaf60e944e..e1f2fe94e95d 100644 --- a/ydb/library/yql/providers/solomon/provider/yql_solomon_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/solomon/provider/yql_solomon_datasource_type_ann.cpp @@ -34,7 +34,11 @@ class TSolomonDataSourceTypeAnnotationTransformer : public TVisitorTransformerBa } TStatus HandleSoSourceSettings(const TExprNode::TPtr& input, TExprContext& ctx) { - if (!EnsureArgsCount(*input, 12U, ctx)) { + if (!EnsureArgsCount(*input, 13, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TSoSourceSettings::idx_World), ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp b/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp index b675a22a91c6..a6a8eb14a8ba 100644 --- a/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp +++ b/ydb/library/yql/providers/solomon/provider/yql_solomon_dq_integration.cpp @@ -211,6 +211,7 @@ class TSolomonDqIntegration: public TDqIntegrationBase { return Build(ctx, read->Pos()) .Input() + .World(soReadObject.World()) .Project(soReadObject.Object().Project()) .Token() .Name().Build(token) diff --git a/ydb/library/yql/providers/ydb/expr_nodes/yql_ydb_expr_nodes.json b/ydb/library/yql/providers/ydb/expr_nodes/yql_ydb_expr_nodes.json index 534f097ec39d..62557c87f9a8 100644 --- 
a/ydb/library/yql/providers/ydb/expr_nodes/yql_ydb_expr_nodes.json +++ b/ydb/library/yql/providers/ydb/expr_nodes/yql_ydb_expr_nodes.json @@ -30,9 +30,10 @@ "Base": "TCallable", "Match": {"Type": "Callable", "Name": "YdbSourceSettings"}, "Children": [ - {"Index": 0, "Name": "Table", "Type": "TCoAtom"}, - {"Index": 1, "Name": "Token", "Type": "TCoSecureParam"}, - {"Index": 2, "Name": "Columns", "Type": "TCoAtomList"} + {"Index": 0, "Name": "World", "Type": "TExprBase"}, + {"Index": 1, "Name": "Table", "Type": "TCoAtom"}, + {"Index": 2, "Name": "Token", "Type": "TCoSecureParam"}, + {"Index": 3, "Name": "Columns", "Type": "TCoAtomList"} ] }, { diff --git a/ydb/library/yql/providers/ydb/provider/yql_ydb_datasource_type_ann.cpp b/ydb/library/yql/providers/ydb/provider/yql_ydb_datasource_type_ann.cpp index edbc95c25e07..c03b2a3191e5 100644 --- a/ydb/library/yql/providers/ydb/provider/yql_ydb_datasource_type_ann.cpp +++ b/ydb/library/yql/providers/ydb/provider/yql_ydb_datasource_type_ann.cpp @@ -29,7 +29,11 @@ class TYdbDataSourceTypeAnnotationTransformer : public TVisitorTransformerBase { TStatus HandleYdbSourceSettings(const TExprNode::TPtr& input, TExprNode::TPtr& output, TExprContext& ctx) { Y_UNUSED(output); - if (!EnsureArgsCount(*input, 3U, ctx)) { + if (!EnsureArgsCount(*input, 4, ctx)) { + return TStatus::Error; + } + + if (!EnsureWorldType(*input->Child(TYdbSourceSettings::idx_World), ctx)) { return TStatus::Error; } diff --git a/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp b/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp index 7ff7fc1c5cc7..9f165f0ba32c 100644 --- a/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp +++ b/ydb/library/yql/providers/ydb/provider/yql_ydb_dq_integration.cpp @@ -101,6 +101,7 @@ class TYdbDqIntegration: public TDqIntegrationBase { return Build(ctx, read->Pos()) .Input() + .World(ydbReadTable.World()) .Table(ydbReadTable.Table()) .Token() .Name().Build(token) diff --git 
a/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted b/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted deleted file mode 100644 index abd564f4a8c8..000000000000 --- a/ydb/library/yql/tests/sql/dq_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted +++ /dev/null @@ -1,5 +0,0 @@ -/program.sql:
: Fatal: Optimization - - /program.sql:
:8:1: Fatal: ydb/library/yql/core/yql_opt_match_recognize.cpp:xxx ExpandMatchRecognize(): requirement sortOrder->ChildrenSize() == 1 failed, message: Expect ORDER BY timestamp for MATCH_RECOGNIZE - select * from (select * from AS_TABLE($data) MATCH_RECOGNIZE( - ^ \ No newline at end of file diff --git a/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json index e59e9089a1b0..57260df1270c 100644 --- a/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part2/canondata/result.json @@ -2871,9 +2871,9 @@ ], "test.test[window-full/session--Debug]": [ { - "checksum": "b06da41f9a9ea38646c43487f4b8b96a", - "size": 13340, - "uri": "https://{canondata_backend}/1775319/8ac8c87858e0db34f5a3c99b3f4ca1084cccbace/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql_patched" + "checksum": "497487e14c39e6f6eeacce4046bfb2f5", + "size": 14172, + "uri": "https://{canondata_backend}/1777230/34342462b74dcf33b334172059f34ab28ba50ba6/resource.tar.gz#test.test_window-full_session--Debug_/opt.yql_patched" } ], "test.test[window-full/session--Plan]": [ @@ -2885,9 +2885,9 @@ ], "test.test[window-full/session_aliases--Debug]": [ { - "checksum": "e021555a47e83d0b792765a8ee82be94", - "size": 14124, - "uri": "https://{canondata_backend}/1775319/8ac8c87858e0db34f5a3c99b3f4ca1084cccbace/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql_patched" + "checksum": "cf18b79ffda288cb9cf374749f268298", + "size": 14969, + "uri": "https://{canondata_backend}/1777230/34342462b74dcf33b334172059f34ab28ba50ba6/resource.tar.gz#test.test_window-full_session_aliases--Debug_/opt.yql_patched" } ], "test.test[window-full/session_aliases--Plan]": [ diff --git a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Basic-default.txt_/opt.yql b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Basic-default.txt_/opt.yql index 
6576fbab7b62..4ba8a34de0a9 100644 --- a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Basic-default.txt_/opt.yql +++ b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Basic-default.txt_/opt.yql @@ -4,9 +4,9 @@ (let $3 (DataType 'String)) (let $4 (StructType '('"kind" $3) '('"labels" (DictType $3 $3)) '('"ts" (DataType 'Datetime)) '('type $3) '('"value" (DataType 'Double)))) (let $5 '('"kind" '"labels" '"value" '"ts" 'type)) -(let $6 (SoSourceSettings '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"AVG" '"PREVIOUS" (Uint32 '"15"))) +(let $6 (SoSourceSettings world '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"AVG" '"PREVIOUS" (Uint32 '"15"))) (let $7 (DqStage '((DqSource (DataSource '"solomon" '"local_solomon") $6)) (lambda '($10) $10) '('('"_logical_id" '199321)))) -(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199342)))) +(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199348)))) (let $9 (ResPull! $1 $2 (Key) (DqCnResult (TDqOutput $8 '"0") '()) '('('type) '('autoref)) '"dq")) (return (Commit! (Commit! 
$9 $2) (DataSink '"solomon" '"local_solomon"))) ) diff --git a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Downsampling-default.txt_/opt.yql b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Downsampling-default.txt_/opt.yql index b2bd63ebba1e..f72069cc7f3b 100644 --- a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Downsampling-default.txt_/opt.yql +++ b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-Downsampling-default.txt_/opt.yql @@ -4,9 +4,9 @@ (let $3 (DataType 'String)) (let $4 (StructType '('"kind" $3) '('"labels" (DictType $3 $3)) '('"ts" (DataType 'Datetime)) '('type $3) '('"value" (DataType 'Double)))) (let $5 '('"kind" '"labels" '"value" '"ts" 'type)) -(let $6 (SoSourceSettings '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"true") '"SUM" '"PREVIOUS" (Uint32 '"25"))) +(let $6 (SoSourceSettings world '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"true") '"SUM" '"PREVIOUS" (Uint32 '"25"))) (let $7 (DqStage '((DqSource (DataSource '"solomon" '"local_solomon") $6)) (lambda '($10) $10) '('('"_logical_id" '199345)))) -(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199366)))) +(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199372)))) (let $9 (ResPull! $1 $2 (Key) (DqCnResult (TDqOutput $8 '"0") '()) '('('type) '('autoref)) '"dq")) (return (Commit! (Commit! 
$9 $2) (DataSink '"solomon" '"local_solomon"))) ) diff --git a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-DownsamplingValidSettings-default.txt_/opt.yql b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-DownsamplingValidSettings-default.txt_/opt.yql index 2bd63c06c6fb..a2b215515150 100644 --- a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-DownsamplingValidSettings-default.txt_/opt.yql +++ b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-DownsamplingValidSettings-default.txt_/opt.yql @@ -4,9 +4,9 @@ (let $3 (DataType 'String)) (let $4 (StructType '('"kind" $3) '('"labels" (DictType $3 $3)) '('"ts" (DataType 'Datetime)) '('type $3) '('"value" (DataType 'Double)))) (let $5 '('"kind" '"labels" '"value" '"ts" 'type)) -(let $6 (SoSourceSettings '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"SUM" '"PREVIOUS" (Uint32 '"15"))) +(let $6 (SoSourceSettings world '"my_project" (SecureParam '"cluster:default_local_solomon") $4 $5 '() '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"SUM" '"PREVIOUS" (Uint32 '"15"))) (let $7 (DqStage '((DqSource (DataSource '"solomon" '"local_solomon") $6)) (lambda '($10) $10) '('('"_logical_id" '199333)))) -(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199354)))) +(let $8 (DqStage '((DqCnUnionAll (TDqOutput $7 '"0"))) (lambda '($11) $11) '('('"_logical_id" '199360)))) (let $9 (ResPull! $1 $2 (Key) (DqCnResult (TDqOutput $8 '"0") '()) '('('type) '('autoref)) '"dq")) (return (Commit! (Commit! 
$9 $2) (DataSink '"solomon" '"local_solomon"))) ) diff --git a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-LabelColumns-default.txt_/opt.yql b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-LabelColumns-default.txt_/opt.yql index 06f24e141131..eeb3bfecb443 100644 --- a/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-LabelColumns-default.txt_/opt.yql +++ b/ydb/library/yql/tests/sql/solomon/canondata/test.test_solomon-LabelColumns-default.txt_/opt.yql @@ -8,9 +8,9 @@ (let $7 (StructType '($3 $4) '($5 $4) '($6 $4) '('"kind" $4) '('"project" $4) '('"ts" (DataType 'Datetime)) '('type $4) '('"value" (DataType 'Double)))) (let $8 '('"kind" '"value" '"ts" 'type)) (let $9 '($3 $5 '"project" $6)) -(let $10 (SoSourceSettings '"my_project" (SecureParam '"cluster:default_local_solomon") $7 $8 $9 '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"AVG" '"PREVIOUS" (Uint32 '"15"))) +(let $10 (SoSourceSettings world '"my_project" (SecureParam '"cluster:default_local_solomon") $7 $8 $9 '"2023-12-08T14:40:39Z" '"2023-12-08T14:45:39Z" '"{}" (Bool '"false") '"AVG" '"PREVIOUS" (Uint32 '"15"))) (let $11 (DqStage '((DqSource (DataSource '"solomon" '"local_solomon") $10)) (lambda '($14) $14) '('('"_logical_id" '199359)))) -(let $12 (DqStage '((DqCnUnionAll (TDqOutput $11 '"0"))) (lambda '($15) $15) '('('"_logical_id" '199380)))) +(let $12 (DqStage '((DqCnUnionAll (TDqOutput $11 '"0"))) (lambda '($15) $15) '('('"_logical_id" '199386)))) (let $13 (ResPull! $1 $2 (Key) (DqCnResult (TDqOutput $12 '"0") '()) '('('type) '('autoref)) '"dq")) (return (Commit! (Commit! 
$13 $2) (DataSink '"solomon" '"local_solomon"))) ) diff --git a/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted b/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted deleted file mode 100644 index abd564f4a8c8..000000000000 --- a/ydb/library/yql/tests/sql/yt_native_file/part7/canondata/test.test_match_recognize-without_order_by--Results_/extracted +++ /dev/null @@ -1,5 +0,0 @@ -/program.sql:
: Fatal: Optimization - - /program.sql:
:8:1: Fatal: ydb/library/yql/core/yql_opt_match_recognize.cpp:xxx ExpandMatchRecognize(): requirement sortOrder->ChildrenSize() == 1 failed, message: Expect ORDER BY timestamp for MATCH_RECOGNIZE - select * from (select * from AS_TABLE($data) MATCH_RECOGNIZE( - ^ \ No newline at end of file From 2e6c50eb7e73a1b57670f6d58ef6f5cae7c743e6 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 18 Oct 2024 12:53:00 +0300 Subject: [PATCH 43/56] YQ-3766 / YQ-3767 Shared reading: fixs to stable (#10594) --- .../fq/libs/row_dispatcher/json_filter.cpp | 10 +++--- .../fq/libs/row_dispatcher/topic_session.cpp | 3 ++ .../libs/row_dispatcher/ut/json_filter_ut.cpp | 10 +++--- .../row_dispatcher/ut/topic_session_ut.cpp | 34 ++++++++++--------- ydb/tests/fq/yds/test_row_dispatcher.py | 10 +++--- 5 files changed, 37 insertions(+), 30 deletions(-) diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.cpp b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp index 04cf6771118b..165612121d54 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_filter.cpp +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.cpp @@ -6,8 +6,9 @@ #include #include -#include #include +#include +#include namespace { @@ -274,6 +275,7 @@ class TJsonFilter::TImpl { str << OffsetFieldName << ", "; for (size_t i = 0; i < columnNames.size(); ++i) { TString columnType = columnTypes[i]; + TString columnName = NFq::EncloseAndEscapeString(columnNames[i], '`'); if (columnType == "Json") { columnType = "String"; } else if (columnType == "Optional") { @@ -281,11 +283,11 @@ class TJsonFilter::TImpl { } if (columnType.StartsWith("Optional")) { - str << "IF(" << columnNames[i] << " IS NOT NULL, Unwrap(CAST(" << columnNames[i] << " as " << columnType << ")), NULL)"; + str << "IF(" << columnName << " IS NOT NULL, Unwrap(CAST(" << columnName << " as " << columnType << ")), NULL)"; } else { - str << "Unwrap(CAST(" << columnNames[i] << " as " << columnType << "))"; + str << "Unwrap(CAST(" << columnName << " as " << columnType << 
"))"; } - str << " as " << columnNames[i] << ((i != columnNames.size() - 1) ? "," : ""); + str << " as " << columnName << ((i != columnNames.size() - 1) ? "," : ""); } str << " FROM Input;\n"; str << "$filtered = SELECT * FROM $fields " << whereFilter << ";\n"; diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp index 1a4c02a1f191..f87669d1f3ea 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -722,12 +722,15 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev) { LOG_ROW_DISPATCHER_DEBUG("Wrong ClientSettings"); return; } + auto& info = it->second; + UsedSize -= info.UsedSize; Clients.erase(it); ClientsWithoutPredicate.erase(ev->Sender); if (Clients.empty()) { StopReadSession(); } UpdateParser(); + SubscribeOnNextEvent(); } void CollectColumns(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams, TSet>& columns) { diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp index 7682485b4644..1e2befea3778 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp @@ -54,16 +54,16 @@ Y_UNIT_TEST_SUITE(TJsonFilterTests) { Y_UNIT_TEST_F(Simple1, TFixture) { TMap result; MakeFilter( - {"a1", "a2"}, - {"String", "UInt64"}, + {"a1", "a2", "a@3"}, + {"String", "UInt64", "Optional"}, "where a2 > 100", [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push({5}, {{"hello1"}, {"99"}}); - Filter->Push({6}, {{"hello2"}, {"101"}}); + Filter->Push({5}, {{"hello1"}, {"99"}, {"zapuskaem"}}); + Filter->Push({6}, {{"hello2"}, {"101"}, {"gusya"}}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); - UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101,"a@3":"gusya"})", result[6]); } Y_UNIT_TEST_F(Simple2, 
TFixture) { diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp index 0c94604792c8..d79edab956ae 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -336,30 +336,29 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { Init(topicName, 50); auto source = BuildSource(topicName); StartSession(ReadActorId1, source); - StartSession(ReadActorId2, source); + StartSession(ReadActorId2, source); // slow session size_t messagesSize = 5; - for (size_t i = 0; i < messagesSize; ++i) { - const std::vector data = { Json1 }; - PQWrite(data, topicName); - } + auto writeMessages = [&]() { + for (size_t i = 0; i < messagesSize; ++i) { + const std::vector data = { Json1 }; + PQWrite(data, topicName); + } + Sleep(TDuration::MilliSeconds(100)); + Runtime.DispatchEvents({}, Runtime.GetCurrentTime() - TDuration::MilliSeconds(1)); + }; + + writeMessages(); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); auto readMessages = ReadMessages(ReadActorId1); UNIT_ASSERT(readMessages == messagesSize); // Reading from yds is stopped. 
- - for (size_t i = 0; i < messagesSize; ++i) { - const std::vector data = { Json1 }; - PQWrite(data, topicName); - } - Sleep(TDuration::MilliSeconds(100)); - Runtime.DispatchEvents({}, Runtime.GetCurrentTime() - TDuration::MilliSeconds(1)); + writeMessages(); readMessages = ReadMessages(ReadActorId1); UNIT_ASSERT(readMessages == 0); - readMessages = ReadMessages(ReadActorId2); UNIT_ASSERT(readMessages == messagesSize); @@ -369,11 +368,14 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { readMessages = ReadMessages(ReadActorId1); UNIT_ASSERT(readMessages == messagesSize); - readMessages = ReadMessages(ReadActorId2); - UNIT_ASSERT(readMessages == messagesSize); + writeMessages(); + StopSession(ReadActorId2, source); // delete slow client, clear unread buffer + Sleep(TDuration::MilliSeconds(100)); + Runtime.DispatchEvents({}, Runtime.GetCurrentTime() - TDuration::MilliSeconds(1)); + readMessages = ReadMessages(ReadActorId1); + UNIT_ASSERT(readMessages == messagesSize); StopSession(ReadActorId1, source); - StopSession(ReadActorId2, source); } Y_UNIT_TEST_F(TwoSessionsWithDifferentSchemes, TFixture) { diff --git a/ydb/tests/fq/yds/test_row_dispatcher.py b/ydb/tests/fq/yds/test_row_dispatcher.py index 7d59d333d620..a2b9bdab9fa7 100644 --- a/ydb/tests/fq/yds/test_row_dispatcher.py +++ b/ydb/tests/fq/yds/test_row_dispatcher.py @@ -297,17 +297,17 @@ def test_filter_missing_fields(self, kikimr, client): sql = Rf''' INSERT INTO {YDS_CONNECTION}.`{self.output_topic}` SELECT Cast(time as String) FROM {YDS_CONNECTION}.`{self.input_topic}` - WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, data String, event String NOT NULL)) - WHERE data IS NULL;''' + WITH (format=json_each_row, SCHEMA (time UInt64 NOT NULL, `data@data` String, event String NOT NULL)) + WHERE `data@data` IS NULL;''' query_id = start_yds_query(kikimr, client, sql) wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) data = [ '{"time": 101, "event": "event1"}', - '{"time": 102, "data": null, "event": 
"event2"}', - '{"time": 103, "data": "", "event": "event2"}', - '{"time": 104, "data": "null", "event": "event2"}', + '{"time": 102, "data@data": null, "event": "event2"}', + '{"time": 103, "data@data": "", "event": "event2"}', + '{"time": 104, "data@data": "null", "event": "event2"}', ] self.write_stream(data) From 9a9711dc0c7a1d0a26a961a32986d6e18d5accfd Mon Sep 17 00:00:00 2001 From: yumkam Date: Fri, 18 Oct 2024 17:39:10 +0300 Subject: [PATCH 44/56] Streamlookup stable backports #7782 #9758 #9396 #10283 #10489 #10508 #7892 #10280 (#10341) Co-authored-by: dmasloff <74042473+dmasloff@users.noreply.github.com> Co-authored-by: Hor911 --- .../compute/dq_compute_actor_async_io.h | 33 ++--- .../dq/actors/compute/dq_compute_actor_impl.h | 45 +++++- .../dq_input_transform_lookup.cpp | 124 +++++++++++++--- .../yql/dq/expr_nodes/dq_expr_nodes.json | 2 +- ydb/library/yql/dq/proto/dq_tasks.proto | 1 + .../providers/dq/opt/physical_optimize.cpp | 2 +- .../actors/ut/yql_generic_lookup_actor_ut.cpp | 52 ++++--- .../generic/actors/yql_generic_base_actor.h | 4 +- .../actors/yql_generic_lookup_actor.cpp | 135 +++++++++++++----- .../generic/actors/yql_generic_lookup_actor.h | 1 + .../actors/yql_generic_provider_factories.cpp | 1 + .../generic/actors/yql_generic_read_actor.cpp | 10 +- .../actors/yql_generic_token_provider.cpp | 2 +- .../actors/yql_generic_token_provider.h | 4 +- .../connector/libcpp/ut_helpers/test_creds.h | 2 +- .../generic/provider/yql_generic_settings.h | 2 +- .../yt/actors/ut/yql_yt_lookup_actor_ut.cpp | 45 +++--- .../yt/actors/yql_yt_lookup_actor.cpp | 30 ++-- ydb/tests/fq/generic/test_streaming_join.py | 22 +++ ydb/tests/tools/fq_runner/kikimr_metrics.py | 2 +- 20 files changed, 375 insertions(+), 144 deletions(-) diff --git a/ydb/library/yql/dq/actors/compute/dq_compute_actor_async_io.h b/ydb/library/yql/dq/actors/compute/dq_compute_actor_async_io.h index 0122b27b150a..7edcd4afe91e 100644 --- a/ydb/library/yql/dq/actors/compute/dq_compute_actor_async_io.h 
+++ b/ydb/library/yql/dq/actors/compute/dq_compute_actor_async_io.h @@ -212,41 +212,26 @@ struct IDqAsyncLookupSource { NKikimr::NMiniKQL::TMKQLAllocator> >; struct TEvLookupRequest: NActors::TEventLocal { - TEvLookupRequest(std::shared_ptr alloc, TUnboxedValueMap&& request) - : Alloc(alloc) - , Request(std::move(request)) + TEvLookupRequest(std::weak_ptr request) + : Request(std::move(request)) { } - ~TEvLookupRequest() { - auto guard = Guard(*Alloc); - TKeyTypeHelper empty; - Request = TUnboxedValueMap{0, empty.GetValueHash(), empty.GetValueEqual()}; - } - std::shared_ptr Alloc; - TUnboxedValueMap Request; + std::weak_ptr Request; }; struct TEvLookupResult: NActors::TEventLocal { - TEvLookupResult(std::shared_ptr alloc, TUnboxedValueMap&& result) - : Alloc(alloc) - , Result(std::move(result)) + TEvLookupResult(std::weak_ptr result) + : Result(std::move(result)) { } - ~TEvLookupResult() { - auto guard = Guard(*Alloc.get()); - TKeyTypeHelper empty; - Result = TUnboxedValueMap{0, empty.GetValueHash(), empty.GetValueEqual()}; - } - - std::shared_ptr Alloc; - TUnboxedValueMap Result; + std::weak_ptr Result; }; virtual size_t GetMaxSupportedKeysInRequest() const = 0; //Initiate lookup for requested keys //Only one request at a time is allowed. 
Request must contain no more than GetMaxSupportedKeysInRequest() keys - //Upon completion, results are sent in TEvLookupResult event to the preconfigured actor - virtual void AsyncLookup(TUnboxedValueMap&& request) = 0; + //Upon completion, TEvLookupResult event is sent to the preconfigured actor + virtual void AsyncLookup(std::weak_ptr request) = 0; protected: ~IDqAsyncLookupSource() {} }; @@ -280,6 +265,7 @@ struct IDqAsyncIoFactory : public TThrRefBase { std::shared_ptr Alloc; std::shared_ptr KeyTypeHelper; NActors::TActorId ParentId; + ::NMonitoring::TDynamicCounterPtr TaskCounters; google::protobuf::Any LookupSource; //provider specific data source const NKikimr::NMiniKQL::TStructType* KeyType; const NKikimr::NMiniKQL::TStructType* PayloadType; @@ -312,6 +298,7 @@ struct IDqAsyncIoFactory : public TThrRefBase { const THashMap& SecureParams; const THashMap& TaskParams; const NActors::TActorId& ComputeActorId; + ::NMonitoring::TDynamicCounterPtr TaskCounters; const NKikimr::NMiniKQL::TTypeEnvironment& TypeEnv; const NKikimr::NMiniKQL::THolderFactory& HolderFactory; std::shared_ptr Alloc; diff --git a/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h b/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h index bcc9cc89784b..84b982755fbf 100644 --- a/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h +++ b/ydb/library/yql/dq/actors/compute/dq_compute_actor_impl.h @@ -203,6 +203,7 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped MkqlMemoryQuota = taskCounters->GetCounter("MkqlMemoryQuota"); OutputChannelSize = taskCounters->GetCounter("OutputChannelSize"); SourceCpuTimeMs = taskCounters->GetCounter("SourceCpuTimeMs", true); + InputTransformCpuTimeMs = taskCounters->GetCounter("InputTransformCpuTimeMs", true); } } @@ -1308,6 +1309,7 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped .SecureParams = secureParams, .TaskParams = taskParams, .ComputeActorId = this->SelfId(), + .TaskCounters = TaskCounters, .TypeEnv = 
typeEnv, .HolderFactory = holderFactory, .Alloc = Alloc, @@ -1397,11 +1399,20 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped void OnNewAsyncInputDataArrived(const IDqComputeActorAsyncInput::TEvNewAsyncInputDataArrived::TPtr& ev) { Y_ABORT_UNLESS(SourcesMap.FindPtr(ev->Get()->InputIndex) || InputTransformsMap.FindPtr(ev->Get()->InputIndex)); - auto cpuTimeDelta = TakeSourceCpuTimeDelta(); - if (SourceCpuTimeMs) { - SourceCpuTimeMs->Add(cpuTimeDelta.MilliSeconds()); + { + auto cpuTimeDelta = TakeSourceCpuTimeDelta(); + if (SourceCpuTimeMs) { + SourceCpuTimeMs->Add(cpuTimeDelta.MilliSeconds()); + } + CpuTimeSpent += cpuTimeDelta; + } + { + auto cpuTimeDelta = TakeInputTransformCpuTimeDelta(); + if (InputTransformCpuTimeMs) { + InputTransformCpuTimeMs->Add(cpuTimeDelta.MilliSeconds()); + } + CpuTimeSpent += cpuTimeDelta; } - CpuTimeSpent += cpuTimeDelta; ContinueExecute(EResumeSource::CANewAsyncInput); } @@ -1596,6 +1607,21 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped return result; } + TDuration GetInputTransformCpuTime() const { + auto result = TDuration::Zero(); + for (auto& [inputIndex, sourceInfo] : InputTransformsMap) { + result += sourceInfo.AsyncInput->GetCpuTime(); + } + return result; + } + + TDuration TakeInputTransformCpuTimeDelta() { + auto newInputTransformCpuTime = GetInputTransformCpuTime(); + auto result = newInputTransformCpuTime - InputTransformCpuTime; + InputTransformCpuTime = newInputTransformCpuTime; + return result; + } + void FillStats(NDqProto::TDqComputeActorStats* dst, bool last) { if (RuntimeSettings.CollectNone()) { return; @@ -1605,7 +1631,7 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped ReportEventElapsedTime(); } - dst->SetCpuTimeUs(CpuTime.MicroSeconds()); + dst->SetCpuTimeUs(CpuTime.MicroSeconds() + SourceCpuTime.MicroSeconds() + InputTransformCpuTime.MicroSeconds()); dst->SetMaxMemoryUsage(MemoryLimits.MemoryQuotaManager->GetMaxMemorySize()); if (auto memProfileStats = 
GetMemoryProfileStats(); memProfileStats) { @@ -1638,10 +1664,13 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped } FillTaskRunnerStats(Task.GetId(), Task.GetStageId(), *taskStats, protoTask, RuntimeSettings.GetCollectStatsLevel()); - // More accurate cpu time counter: + auto cpuTimeUs = taskStats->ComputeCpuTime.MicroSeconds() + taskStats->BuildCpuTime.MicroSeconds(); if (TDerived::HasAsyncTaskRunner) { - protoTask->SetCpuTimeUs(CpuTime.MicroSeconds() + taskStats->ComputeCpuTime.MicroSeconds() + taskStats->BuildCpuTime.MicroSeconds()); + // Async TR is another actor, summarize CPU usage + cpuTimeUs += CpuTime.MicroSeconds(); } + // CpuTimeUs does include SourceCpuTime + protoTask->SetCpuTimeUs(cpuTimeUs + SourceCpuTime.MicroSeconds() + InputTransformCpuTime.MicroSeconds()); protoTask->SetSourceCpuTimeUs(SourceCpuTime.MicroSeconds()); ui64 ingressBytes = 0; @@ -1901,6 +1930,7 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped TDqComputeActorMetrics MetricsReporter; NWilson::TSpan ComputeActorSpan; TDuration SourceCpuTime; + TDuration InputTransformCpuTime; private: bool Running = true; TInstant LastSendStatsTime; @@ -1910,6 +1940,7 @@ class TDqComputeActorBase : public NActors::TActorBootstrapped ::NMonitoring::TDynamicCounters::TCounterPtr MkqlMemoryQuota; ::NMonitoring::TDynamicCounters::TCounterPtr OutputChannelSize; ::NMonitoring::TDynamicCounters::TCounterPtr SourceCpuTimeMs; + ::NMonitoring::TDynamicCounters::TCounterPtr InputTransformCpuTimeMs; THolder Stat; TDuration CpuTimeSpent; }; diff --git a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp index 98aabb3a9314..ccec56621dfc 100644 --- a/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp +++ b/ydb/library/yql/dq/actors/input_transforms/dq_input_transform_lookup.cpp @@ -33,6 +33,7 @@ class TInputTransformStreamLookupBase ui64 inputIndex, 
NUdf::TUnboxedValue inputFlow, NActors::TActorId computeActorId, + ::NMonitoring::TDynamicCounterPtr taskCounters, IDqAsyncIoFactory* factory, NDqProto::TDqInputTransformLookupSettings&& settings, TVector&& lookupInputIndexes, @@ -42,6 +43,7 @@ class TInputTransformStreamLookupBase const NMiniKQL::TStructType* lookupPayloadType, const NMiniKQL::TMultiType* outputRowType, TOutputRowColumnOrder&& outputRowColumnOrder, + size_t maxDelayedRows, size_t cacheLimit, std::chrono::seconds cacheTtl ) @@ -51,6 +53,7 @@ class TInputTransformStreamLookupBase , InputIndex(inputIndex) , InputFlow(std::move(inputFlow)) , ComputeActorId(std::move(computeActorId)) + , TaskCounters(taskCounters) , Factory(factory) , Settings(std::move(settings)) , LookupInputIndexes(std::move(lookupInputIndexes)) @@ -63,9 +66,9 @@ class TInputTransformStreamLookupBase , OutputRowColumnOrder(std::move(outputRowColumnOrder)) , InputFlowFetchStatus(NUdf::EFetchStatus::Yield) , LruCache(std::make_unique(cacheLimit, lookupKeyType)) + , MaxDelayedRows(maxDelayedRows) , CacheTtl(cacheTtl) , ReadyQueue(OutputRowType) - , WaitingForLookupResults(false) { Y_ABORT_UNLESS(Alloc); for (size_t i = 0; i != LookupInputIndexes.size(); ++i) { @@ -75,6 +78,7 @@ class TInputTransformStreamLookupBase Y_DEBUG_ABORT_UNLESS(OtherInputIndexes[i] < InputRowType->GetElementsCount()); } Y_DEBUG_ABORT_UNLESS(LookupInputIndexes.size() == LookupKeyType->GetMembersCount()); + InitMonCounters(taskCounters); } void Bootstrap() { @@ -83,6 +87,7 @@ class TInputTransformStreamLookupBase .Alloc = Alloc, .KeyTypeHelper = KeyTypeHelper, .ParentId = SelfId(), + .TaskCounters = TaskCounters, .LookupSource = Settings.GetRightSource().GetLookupSource(), .KeyType = LookupKeyType, .PayloadType = LookupPayloadType, @@ -91,8 +96,10 @@ class TInputTransformStreamLookupBase .MaxKeysInRequest = 1000 // TODO configure me }; auto guard = Guard(*Alloc); - LookupSource = Factory->CreateDqLookupSource(Settings.GetRightSource().GetProviderName(), 
std::move(lookupSourceArgs)); - RegisterWithSameMailbox(LookupSource.second); + auto [lookupSource, lookupSourceActor] = Factory->CreateDqLookupSource(Settings.GetRightSource().GetProviderName(), std::move(lookupSourceArgs)); + MaxKeysInRequest = lookupSource->GetMaxSupportedKeysInRequest(); + LookupSourceId = RegisterWithSameMailbox(lookupSourceActor); + KeysForLookup = std::make_shared(MaxKeysInRequest, KeyTypeHelper->GetValueHash(), KeyTypeHelper->GetValueEqual()); } protected: virtual NUdf::EFetchStatus FetchWideInputValue(NUdf::TUnboxedValue* inputRowItems) = 0; @@ -101,8 +108,17 @@ class TInputTransformStreamLookupBase private: //events STRICT_STFUNC(StateFunc, hFunc(IDqAsyncLookupSource::TEvLookupResult, Handle); + hFunc(IDqComputeActorAsyncInput::TEvAsyncInputError, Handle); ) + void Handle(IDqComputeActorAsyncInput::TEvAsyncInputError::TPtr ev) { + auto evptr = ev->Get(); + Send(ComputeActorId, new IDqComputeActorAsyncInput::TEvAsyncInputError( + InputIndex, + evptr->Issues, + evptr->FatalCode)); + } + void AddReadyQueue(NUdf::TUnboxedValue& lookupKey, NUdf::TUnboxedValue& inputOther, NUdf::TUnboxedValue *lookupPayload) { NUdf::TUnboxedValue* outputRowItems; NUdf::TUnboxedValue outputRow = HolderFactory.CreateDirectArrayHolder(OutputRowColumnOrder.size(), outputRowItems); @@ -132,21 +148,31 @@ class TInputTransformStreamLookupBase } void Handle(IDqAsyncLookupSource::TEvLookupResult::TPtr ev) { + auto startCycleCount = GetCycleCountFast(); + if (!KeysForLookup) { + return; + } auto guard = BindAllocator(); const auto now = std::chrono::steady_clock::now(); - auto lookupResult = std::move(ev->Get()->Result); + auto lookupResult = ev->Get()->Result.lock(); + Y_ABORT_UNLESS(lookupResult == KeysForLookup); for (; !AwaitingQueue.empty(); AwaitingQueue.pop_front()) { auto& [lookupKey, inputOther] = AwaitingQueue.front(); - auto lookupPayload = lookupResult.FindPtr(lookupKey); + auto lookupPayload = lookupResult->FindPtr(lookupKey); if (lookupPayload == nullptr) { 
continue; } AddReadyQueue(lookupKey, inputOther, lookupPayload); } - for (auto&& [k, v]: lookupResult) { + for (auto&& [k, v]: *lookupResult) { LruCache->Update(NUdf::TUnboxedValue(const_cast(k)), std::move(v), now + CacheTtl); } - WaitingForLookupResults = false; + KeysForLookup->clear(); + auto deltaTime = GetCpuTimeDelta(startCycleCount); + CpuTime += deltaTime; + if (CpuTimeUs) { + CpuTimeUs->Add(deltaTime.MicroSeconds()); + } Send(ComputeActorId, new TEvNewAsyncInputDataArrived{InputIndex}); } @@ -165,9 +191,10 @@ class TInputTransformStreamLookupBase } void PassAway() final { - Send(LookupSource.second->SelfId(), new NActors::TEvents::TEvPoison{}); + Send(LookupSourceId, new NActors::TEvents::TEvPoison{}); auto guard = BindAllocator(); //All resources, held by this class, that have been created with mkql allocator, must be deallocated here + KeysForLookup.reset(); InputFlow.Clear(); KeyTypeHelper.reset(); decltype(AwaitingQueue){}.swap(AwaitingQueue); @@ -184,47 +211,89 @@ class TInputTransformStreamLookupBase i64 GetAsyncInputData(NKikimr::NMiniKQL::TUnboxedValueBatch& batch, TMaybe&, bool& finished, i64 freeSpace) final { Y_UNUSED(freeSpace); + auto startCycleCount = GetCycleCountFast(); auto guard = BindAllocator(); DrainReadyQueue(batch); - if (InputFlowFetchStatus != NUdf::EFetchStatus::Finish && !WaitingForLookupResults) { - NUdf::TUnboxedValue* inputRowItems; - NUdf::TUnboxedValue inputRow = HolderFactory.CreateDirectArrayHolder(InputRowType->GetElementsCount(), inputRowItems); + if (InputFlowFetchStatus != NUdf::EFetchStatus::Finish && KeysForLookup->empty()) { + Y_DEBUG_ABORT_UNLESS(AwaitingQueue.empty()); + NUdf::TUnboxedValue* inputRowItems; + NUdf::TUnboxedValue inputRow = HolderFactory.CreateDirectArrayHolder(InputRowType->GetElementsCount(), inputRowItems); const auto now = std::chrono::steady_clock::now(); - const auto maxKeysInRequest = LookupSource.first->GetMaxSupportedKeysInRequest(); - IDqAsyncLookupSource::TUnboxedValueMap 
keysForLookup{maxKeysInRequest, KeyTypeHelper->GetValueHash(), KeyTypeHelper->GetValueEqual()}; LruCache->Prune(now); + size_t rowLimit = std::numeric_limits::max(); + size_t row = 0; while ( - (keysForLookup.size() < maxKeysInRequest) && + row < rowLimit && + (KeysForLookup->size() < MaxKeysInRequest) && ((InputFlowFetchStatus = FetchWideInputValue(inputRowItems)) == NUdf::EFetchStatus::Ok)) { NUdf::TUnboxedValue* keyItems; NUdf::TUnboxedValue key = HolderFactory.CreateDirectArrayHolder(LookupInputIndexes.size(), keyItems); NUdf::TUnboxedValue* otherItems; NUdf::TUnboxedValue other = HolderFactory.CreateDirectArrayHolder(OtherInputIndexes.size(), otherItems); + bool nullsInKey = false; for (size_t i = 0; i != LookupInputIndexes.size(); ++i) { keyItems[i] = inputRowItems[LookupInputIndexes[i]]; + if (!keyItems[i]) { + nullsInKey = true; + } } for (size_t i = 0; i != OtherInputIndexes.size(); ++i) { otherItems[i] = inputRowItems[OtherInputIndexes[i]]; } - if (auto lookupPayload = LruCache->Get(key, now)) { + if (nullsInKey) { + AddReadyQueue(key, other, nullptr); + } else if (auto lookupPayload = LruCache->Get(key, now)) { AddReadyQueue(key, other, &*lookupPayload); } else { + if (AwaitingQueue.empty()) { + // look ahead at most MaxDelayedRows after first missing + rowLimit = row + MaxDelayedRows; + } AwaitingQueue.emplace_back(key, std::move(other)); - keysForLookup.emplace(std::move(key), NUdf::TUnboxedValue{}); + KeysForLookup->emplace(std::move(key), NUdf::TUnboxedValue{}); } + ++row; } - if (!keysForLookup.empty()) { - LookupSource.first->AsyncLookup(std::move(keysForLookup)); - WaitingForLookupResults = true; + if (Batches && (!KeysForLookup->empty() || !ReadyQueue.RowCount())) { + Batches->Inc(); + LruHits->Add(ReadyQueue.RowCount()); + LruMiss->Add(AwaitingQueue.size()); + } + if (!KeysForLookup->empty()) { + Send(LookupSourceId, new IDqAsyncLookupSource::TEvLookupRequest(KeysForLookup)); } DrainReadyQueue(batch); } + auto deltaTime = 
GetCpuTimeDelta(startCycleCount); + CpuTime += deltaTime; + if (CpuTimeUs) { + CpuTimeUs->Add(deltaTime.MicroSeconds()); + } finished = IsFinished(); return AwaitingQueue.size(); } + void InitMonCounters(const ::NMonitoring::TDynamicCounterPtr& taskCounters) { + if (!taskCounters) { + return; + } + auto component = taskCounters->GetSubgroup("component", "Lookup"); + LruHits = component->GetCounter("Hits"); + LruMiss = component->GetCounter("Miss"); + CpuTimeUs = component->GetCounter("CpuUs"); + Batches = component->GetCounter("Batches"); + } + + static TDuration GetCpuTimeDelta(ui64 startCycleCount) { + return TDuration::Seconds(NHPTimer::GetSeconds(GetCycleCountFast() - startCycleCount)); + } + + TDuration GetCpuTime() override { + return CpuTime; + } + TMaybe ExtraData() override { google::protobuf::Any result; //TODO fill me @@ -256,10 +325,12 @@ class TInputTransformStreamLookupBase ui64 InputIndex; // NYql::NDq::IDqComputeActorAsyncInput NUdf::TUnboxedValue InputFlow; const NActors::TActorId ComputeActorId; + ::NMonitoring::TDynamicCounterPtr TaskCounters; IDqAsyncIoFactory::TPtr Factory; NDqProto::TDqInputTransformLookupSettings Settings; protected: - std::pair LookupSource; + NActors::TActorId LookupSourceId; + size_t MaxKeysInRequest; const TVector LookupInputIndexes; const TVector OtherInputIndexes; const NMiniKQL::TMultiType* const InputRowType; @@ -271,13 +342,20 @@ class TInputTransformStreamLookupBase NUdf::EFetchStatus InputFlowFetchStatus; std::unique_ptr LruCache; + size_t MaxDelayedRows; std::chrono::seconds CacheTtl; using TInputKeyOtherPair = std::pair; using TAwaitingQueue = std::deque>; //input row split in two parts: key columns and other columns TAwaitingQueue AwaitingQueue; NKikimr::NMiniKQL::TUnboxedValueBatch ReadyQueue; - std::atomic WaitingForLookupResults; NYql::NDq::TDqAsyncStats IngressStats; + std::shared_ptr KeysForLookup; + + ::NMonitoring::TDynamicCounters::TCounterPtr LruHits; + ::NMonitoring::TDynamicCounters::TCounterPtr 
LruMiss; + ::NMonitoring::TDynamicCounters::TCounterPtr CpuTimeUs; + ::NMonitoring::TDynamicCounters::TCounterPtr Batches; + TDuration CpuTime; }; class TInputTransformStreamLookupWide: public TInputTransformStreamLookupBase { @@ -524,6 +602,7 @@ std::pair CreateInputTransformStre args.InputIndex, args.TransformInput, args.ComputeActorId, + args.TaskCounters, factory, std::move(settings), std::move(lookupKeyInputIndexes), @@ -533,6 +612,7 @@ std::pair CreateInputTransformStre lookupPayloadType, outputRowType, std::move(outputColumnsOrder), + settings.GetMaxDelayedRows(), settings.GetCacheLimit(), std::chrono::seconds(settings.GetCacheTtlSeconds()) ) : @@ -543,6 +623,7 @@ std::pair CreateInputTransformStre args.InputIndex, args.TransformInput, args.ComputeActorId, + args.TaskCounters, factory, std::move(settings), std::move(lookupKeyInputIndexes), @@ -552,6 +633,7 @@ std::pair CreateInputTransformStre lookupPayloadType, outputRowType, std::move(outputColumnsOrder), + settings.GetMaxDelayedRows(), settings.GetCacheLimit(), std::chrono::seconds(settings.GetCacheTtlSeconds()) ); diff --git a/ydb/library/yql/dq/expr_nodes/dq_expr_nodes.json b/ydb/library/yql/dq/expr_nodes/dq_expr_nodes.json index 0d29ed67202f..6040ff143ea7 100644 --- a/ydb/library/yql/dq/expr_nodes/dq_expr_nodes.json +++ b/ydb/library/yql/dq/expr_nodes/dq_expr_nodes.json @@ -188,7 +188,7 @@ {"Index": 6, "Name": "LeftJoinKeyNames", "Type": "TCoAtomList"}, {"Index": 7, "Name": "RightJoinKeyNames", "Type": "TCoAtomList"}, {"Index": 8, "Name": "TTL", "Type": "TCoAtom"}, - {"Index": 9, "Name": "MaxDelay", "Type": "TCoAtom"}, + {"Index": 9, "Name": "MaxDelayedRows", "Type": "TCoAtom"}, {"Index": 10, "Name": "MaxCachedRows", "Type": "TCoAtom"} ] }, diff --git a/ydb/library/yql/dq/proto/dq_tasks.proto b/ydb/library/yql/dq/proto/dq_tasks.proto index da8bdf36122e..76006b2f6049 100644 --- a/ydb/library/yql/dq/proto/dq_tasks.proto +++ b/ydb/library/yql/dq/proto/dq_tasks.proto @@ -186,6 +186,7 @@ message 
TDqInputTransformLookupSettings { bytes NarrowOutputRowType = 8; //Serialized struct type uint64 CacheLimit = 9; uint64 CacheTtlSeconds = 10; + uint64 MaxDelayedRows = 11; } message TDqTask { diff --git a/ydb/library/yql/providers/dq/opt/physical_optimize.cpp b/ydb/library/yql/providers/dq/opt/physical_optimize.cpp index 0eba2f3be213..6406527a9576 100644 --- a/ydb/library/yql/providers/dq/opt/physical_optimize.cpp +++ b/ydb/library/yql/providers/dq/opt/physical_optimize.cpp @@ -227,7 +227,7 @@ class TDqsPhysicalOptProposalTransformer : public TOptimizeTransformerBase { .RightJoinKeyNames(join.RightJoinKeyNames()) .TTL(ctx.NewAtom(pos, 300)) //TODO configure me .MaxCachedRows(ctx.NewAtom(pos, 1'000'000)) //TODO configure me - .MaxDelay(ctx.NewAtom(pos, 1'000'000)) //Configure me + .MaxDelayedRows(ctx.NewAtom(pos, 1'000'000)) //Configure me .Done(); auto lambda = Build(ctx, pos) diff --git a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp index d7b77b8bbda6..619f10a99bcf 100644 --- a/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp +++ b/ydb/library/yql/providers/generic/actors/ut/yql_generic_lookup_actor_ut.cpp @@ -30,21 +30,31 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { return result; } - //Simple actor to call IDqAsyncLookupSource::AsyncLookup from an actor system's thread + // Simple actor to call IDqAsyncLookupSource::AsyncLookup from an actor system's thread class TCallLookupActor: public TActorBootstrapped { public: TCallLookupActor( std::shared_ptr alloc, - NYql::NDq::IDqAsyncLookupSource* lookupSource, - NYql::NDq::IDqAsyncLookupSource::TUnboxedValueMap&& request) + const NActors::TActorId& lookupActor, + std::shared_ptr request) : Alloc(alloc) - , LookupSource(lookupSource) - , Request(std::move(request)) + , LookupActor(lookupActor) + , Request(request) { } void Bootstrap() { - 
LookupSource->AsyncLookup(std::move(Request)); + auto ev = new NYql::NDq::IDqAsyncLookupSource::TEvLookupRequest(Request); + TActivationContext::ActorSystem()->Send(new NActors::IEventHandle(LookupActor, SelfId(), ev)); + } + + void PassAway() override { + auto guard = Guard(*Alloc); + Request.reset(); + } + + ~TCallLookupActor() { + PassAway(); } private: @@ -52,8 +62,8 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { private: std::shared_ptr Alloc; - NYql::NDq::IDqAsyncLookupSource* LookupSource; - NYql::NDq::IDqAsyncLookupSource::TUnboxedValueMap Request; + const NActors::TActorId LookupActor; + std::shared_ptr Request; }; Y_UNIT_TEST(Lookup) { @@ -136,7 +146,7 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { .AddResponse( MakeRecordBatch( MakeArray("id", {0, 1, 2}, arrow::uint64()), - MakeArray("optional_id", {100, 101, 103}, arrow::uint64()), //the last value is intentially wrong + MakeArray("optional_id", {100, 101, 103}, arrow::uint64()), // the last value is intentially wrong MakeArray("string_value", {"a", "b", "c"}, arrow::utf8()) ), NewSuccess() @@ -166,6 +176,7 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { connectorMock, std::make_shared(), edge, + nullptr, alloc, keyTypeHelper, std::move(lookupSourceSettings), @@ -174,44 +185,45 @@ Y_UNIT_TEST_SUITE(GenericProviderLookupActor) { typeEnv, holderFactory, 1'000'000); - runtime.Register(actor); + auto lookupActor = runtime.Register(actor); - NYql::NDq::IDqAsyncLookupSource::TUnboxedValueMap request(3, keyTypeHelper->GetValueHash(), keyTypeHelper->GetValueEqual()); + auto request = std::make_shared(3, keyTypeHelper->GetValueHash(), keyTypeHelper->GetValueEqual()); for (size_t i = 0; i != 3; ++i) { NYql::NUdf::TUnboxedValue* keyItems; auto key = holderFactory.CreateDirectArrayHolder(2, keyItems); keyItems[0] = NYql::NUdf::TUnboxedValuePod(ui64(i)); keyItems[1] = NYql::NUdf::TUnboxedValuePod(ui64(100 + i)); - request.emplace(std::move(key), NYql::NUdf::TUnboxedValue{}); + 
request->emplace(std::move(key), NYql::NUdf::TUnboxedValue{}); } - guard.Release(); //let actors use alloc + guard.Release(); // let actors use alloc - auto callLookupActor = new TCallLookupActor(alloc, lookupSource, std::move(request)); + auto callLookupActor = new TCallLookupActor(alloc, lookupActor, request); runtime.Register(callLookupActor); auto ev = runtime.GrabEdgeEventRethrow(edge); auto guard2 = Guard(*alloc.get()); - auto lookupResult = std::move(ev->Get()->Result); + auto lookupResult = ev->Get()->Result.lock(); + UNIT_ASSERT(lookupResult); - UNIT_ASSERT_EQUAL(3, lookupResult.size()); + UNIT_ASSERT_EQUAL(3, lookupResult->size()); { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {0, 100})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {0, 100})); UNIT_ASSERT(v); NYql::NUdf::TUnboxedValue val = v->GetElement(0); UNIT_ASSERT(val.AsStringRef() == TStringBuf("a")); } { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {1, 101})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {1, 101})); UNIT_ASSERT(v); NYql::NUdf::TUnboxedValue val = v->GetElement(0); UNIT_ASSERT(val.AsStringRef() == TStringBuf("b")); } { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {2, 102})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {2, 102})); UNIT_ASSERT(v); UNIT_ASSERT(!*v); } } -} //Y_UNIT_TEST_SUITE(GenericProviderLookupActor) +} // Y_UNIT_TEST_SUITE(GenericProviderLookupActor) diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h b/ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h index c38fa9c9f0d4..ec7810ad163e 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h +++ b/ydb/library/yql/providers/generic/actors/yql_generic_base_actor.h @@ -10,7 +10,7 @@ namespace NYql::NDq { template class TGenericBaseActor: public NActors::TActorBootstrapped { - protected: 
//Events + protected: // Events // Event ids enum EEventIds: ui32 { EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), @@ -89,7 +89,7 @@ namespace NYql::NDq { NConnector::NApi::TError Error; }; - protected: //TODO move common logic here + protected: // TODO move common logic here }; } // namespace NYql::NDq diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp index 21090f61e5b8..d4c8b242882a 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.cpp @@ -44,11 +44,11 @@ namespace NYql::NDq { template T ExtractFromConstFuture(const NThreading::TFuture& f) { - //We want to avoid making a copy of data stored in a future. - //But there is no direct way to extract data from a const future5 - //So, we make a copy of the future, that is cheap. Then, extract the value from this copy. - //It destructs the value in the original future, but this trick is legal and documented here: - //https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency + // We want to avoid making a copy of data stored in a future. + // But there is no direct way to extract data from a const future5 + // So, we make a copy of the future, that is cheap. Then, extract the value from this copy. 
+ // It destructs the value in the original future, but this trick is legal and documented here: + // https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency return NThreading::TFuture(f).ExtractValueSync(); } @@ -64,6 +64,7 @@ namespace NYql::NDq { NConnector::IClient::TPtr connectorClient, TGenericTokenProvider::TPtr tokenProvider, NActors::TActorId&& parentId, + ::NMonitoring::TDynamicCounterPtr taskCounters, std::shared_ptr alloc, std::shared_ptr keyTypeHelper, NYql::Generic::TLookupSource&& lookupSource, @@ -84,19 +85,34 @@ namespace NYql::NDq { , HolderFactory(holderFactory) , ColumnDestinations(CreateColumnDestination()) , MaxKeysInRequest(maxKeysInRequest) - , Request( - 0, - KeyTypeHelper->GetValueHash(), - KeyTypeHelper->GetValueEqual()) { + InitMonCounters(taskCounters); } ~TGenericLookupActor() { + Free(); + } + + private: + void Free() { auto guard = Guard(*Alloc); + Request.reset(); KeyTypeHelper.reset(); - TKeyTypeHelper empty; - Request = IDqAsyncLookupSource::TUnboxedValueMap(0, empty.GetValueHash(), empty.GetValueEqual()); } + void InitMonCounters(const ::NMonitoring::TDynamicCounterPtr& taskCounters) { + if (!taskCounters) { + return; + } + auto component = taskCounters->GetSubgroup("component", "LookupSrc"); + Count = component->GetCounter("Reqs"); + Keys = component->GetCounter("Keys"); + ResultChunks = component->GetCounter("Chunks"); + ResultRows = component->GetCounter("Rows"); + ResultBytes = component->GetCounter("Bytes"); + AnswerTime = component->GetCounter("AnswerMs"); + CpuTime = component->GetCounter("CpuUs"); + } + public: void Bootstrap() { auto dsi = LookupSource.data_source_instance(); @@ -112,17 +128,22 @@ namespace NYql::NDq { static constexpr char ActorName[] = "GENERIC_PROVIDER_LOOKUP_ACTOR"; - private: //IDqAsyncLookupSource + private: // IDqAsyncLookupSource size_t GetMaxSupportedKeysInRequest() const override { return MaxKeysInRequest; } - void AsyncLookup(IDqAsyncLookupSource::TUnboxedValueMap&& request) override { + 
void AsyncLookup(std::weak_ptr request) override { auto guard = Guard(*Alloc); - CreateRequest(std::move(request)); + CreateRequest(request.lock()); + } + void PassAway() override { + Free(); + TBase::PassAway(); } - private: //events + private: // events STRICT_STFUNC(StateFunc, + hFunc(TEvLookupRequest, Handle); hFunc(TEvListSplitsIterator, Handle); hFunc(TEvListSplitsPart, Handle); hFunc(TEvReadSplitsIterator, Handle); @@ -189,19 +210,43 @@ namespace NYql::NDq { FinalizeRequest(); } - void Handle(TEvError::TPtr) { - FinalizeRequest(); + void Handle(TEvError::TPtr ev) { + auto actorSystem = TActivationContext::ActorSystem(); + auto error = ev->Get()->Error; + auto errEv = std::make_unique( + -1, + NConnector::ErrorToIssues(error), + NConnector::ErrorToDqStatus(error)); + actorSystem->Send(new NActors::IEventHandle(ParentId, SelfId(), errEv.release())); } void Handle(NActors::TEvents::TEvPoison::TPtr) { PassAway(); } + void Handle(TEvLookupRequest::TPtr ev) { + auto guard = Guard(*Alloc); + CreateRequest(ev->Get()->Request.lock()); + } + private: - void CreateRequest(IDqAsyncLookupSource::TUnboxedValueMap&& request) { - YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << SelfId() << " Got LookupRequest for " << request.size() << " keys"; - Y_ABORT_IF(InProgress); - Y_ABORT_IF(request.size() == 0 || request.size() > MaxKeysInRequest); + static TDuration GetCpuTimeDelta(ui64 startCycleCount) { + return TDuration::Seconds(NHPTimer::GetSeconds(GetCycleCountFast() - startCycleCount)); + } + + void CreateRequest(std::shared_ptr request) { + if (!request) { + return; + } + auto startCycleCount = GetCycleCountFast(); + SentTime = TInstant::Now(); + YQL_CLOG(DEBUG, ProviderGeneric) << "ActorId=" << SelfId() << " Got LookupRequest for " << request->size() << " keys"; + Y_ABORT_IF(request->size() == 0 || request->size() > MaxKeysInRequest); + + if (Count) { + Count->Inc(); + Keys->Add(request->size()); + } Request = std::move(request); NConnector::NApi::TListSplitsRequest 
splitRequest; @@ -224,6 +269,9 @@ namespace NYql::NDq { SendError(actorSystem, selfId, result.Status); } }); + if (CpuTime) { + CpuTime->Add(GetCpuTimeDelta(startCycleCount).MicroSeconds()); + } } void ReadNextData() { @@ -251,9 +299,17 @@ namespace NYql::NDq { } void ProcessReceivedData(const NConnector::NApi::TReadSplitsResponse& resp) { + auto startCycleCount = GetCycleCountFast(); Y_ABORT_UNLESS(resp.payload_case() == NConnector::NApi::TReadSplitsResponse::PayloadCase::kArrowIpcStreaming); + if (ResultChunks) { + ResultChunks->Inc(); + if (resp.has_stats()) { + ResultRows->Add(resp.stats().rows()); + ResultBytes->Add(resp.stats().bytes()); + } + } auto guard = Guard(*Alloc); - NKikimr::NArrow::NSerialization::TSerializerContainer deser = NKikimr::NArrow::NSerialization::TSerializerContainer::GetDefaultSerializer(); //todo move to class' member + NKikimr::NArrow::NSerialization::TSerializerContainer deser = NKikimr::NArrow::NSerialization::TSerializerContainer::GetDefaultSerializer(); // todo move to class' member const auto& data = deser->Deserialize(resp.arrow_ipc_streaming()); Y_ABORT_UNLESS(data.ok()); const auto& value = data.ValueOrDie(); @@ -273,20 +329,26 @@ namespace NYql::NDq { for (size_t j = 0; j != columns.size(); ++j) { (ColumnDestinations[j].first == EColumnDestination::Key ? 
keyItems : outputItems)[ColumnDestinations[j].second] = columns[j][i]; } - if (auto* v = Request.FindPtr(key)) { - *v = std::move(output); //duplicates will be overwritten + if (auto* v = Request->FindPtr(key)) { + *v = std::move(output); // duplicates will be overwritten } } + if (CpuTime) { + CpuTime->Add(GetCpuTimeDelta(startCycleCount).MicroSeconds()); + } } void FinalizeRequest() { - YQL_CLOG(DEBUG, ProviderGeneric) << "Sending lookup results for " << Request.size() << " keys"; + YQL_CLOG(DEBUG, ProviderGeneric) << "Sending lookup results for " << Request->size() << " keys"; auto guard = Guard(*Alloc); - auto ev = new IDqAsyncLookupSource::TEvLookupResult(Alloc, std::move(Request)); + auto ev = new IDqAsyncLookupSource::TEvLookupResult(Request); + if (AnswerTime) { + AnswerTime->Add((TInstant::Now() - SentTime).MilliSeconds()); + } + Request.reset(); TActivationContext::ActorSystem()->Send(new NActors::IEventHandle(ParentId, SelfId(), ev)); LookupResult = {}; ReadSplitsIterator = {}; - InProgress = false; } static void SendError(NActors::TActorSystem* actorSystem, const NActors::TActorId& selfId, const NConnector::NApi::TError& error) { @@ -352,7 +414,7 @@ namespace NYql::NDq { select.mutable_from()->Settable(LookupSource.table()); NConnector::NApi::TPredicate_TDisjunction disjunction; - for (const auto& [k, _] : Request) { + for (const auto& [k, _] : *Request) { NConnector::NApi::TPredicate_TConjunction conjunction; for (ui32 c = 0; c != KeyType->GetMembersCount(); ++c) { NConnector::NApi::TPredicate_TComparison eq; @@ -378,20 +440,28 @@ namespace NYql::NDq { const NYql::Generic::TLookupSource LookupSource; const NKikimr::NMiniKQL::TStructType* const KeyType; const NKikimr::NMiniKQL::TStructType* const PayloadType; - const NKikimr::NMiniKQL::TStructType* const SelectResultType; //columns from KeyType + PayloadType + const NKikimr::NMiniKQL::TStructType* const SelectResultType; // columns from KeyType + PayloadType const NKikimr::NMiniKQL::THolderFactory& 
HolderFactory; const std::vector> ColumnDestinations; const size_t MaxKeysInRequest; - std::atomic_bool InProgress; - IDqAsyncLookupSource::TUnboxedValueMap Request; - NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; //TODO move me to TEvReadSplitsPart + std::shared_ptr Request; + NConnector::IReadSplitsStreamIterator::TPtr ReadSplitsIterator; // TODO move me to TEvReadSplitsPart NKikimr::NMiniKQL::TKeyPayloadPairVector LookupResult; + ::NMonitoring::TDynamicCounters::TCounterPtr Count; + ::NMonitoring::TDynamicCounters::TCounterPtr Keys; + ::NMonitoring::TDynamicCounters::TCounterPtr ResultRows; + ::NMonitoring::TDynamicCounters::TCounterPtr ResultBytes; + ::NMonitoring::TDynamicCounters::TCounterPtr ResultChunks; + ::NMonitoring::TDynamicCounters::TCounterPtr AnswerTime; + ::NMonitoring::TDynamicCounters::TCounterPtr CpuTime; + TInstant SentTime; }; std::pair CreateGenericLookupActor( NConnector::IClient::TPtr connectorClient, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, NActors::TActorId parentId, + ::NMonitoring::TDynamicCounterPtr taskCounters, std::shared_ptr alloc, std::shared_ptr keyTypeHelper, NYql::Generic::TLookupSource&& lookupSource, @@ -407,6 +477,7 @@ namespace NYql::NDq { connectorClient, std::move(tokenProvider), std::move(parentId), + taskCounters, alloc, keyTypeHelper, std::move(lookupSource), diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.h b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.h index 9f8c0c268f23..128964b1553f 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.h +++ b/ydb/library/yql/providers/generic/actors/yql_generic_lookup_actor.h @@ -15,6 +15,7 @@ namespace NYql::NDq { NConnector::IClient::TPtr connectorClient, ISecuredServiceAccountCredentialsFactory::TPtr credentialsFactory, NActors::TActorId parentId, + ::NMonitoring::TDynamicCounterPtr taskCounters, std::shared_ptr alloc, std::shared_ptr keyTypeHelper, 
NYql::Generic::TLookupSource&& lookupSource, diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_provider_factories.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_provider_factories.cpp index e9b2b8bf8bd5..88c5e656ee82 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_provider_factories.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_provider_factories.cpp @@ -22,6 +22,7 @@ namespace NYql::NDq { genericClient, credentialsFactory, std::move(args.ParentId), + args.TaskCounters, args.Alloc, args.KeyTypeHelper, std::move(lookupSource), diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp index 4c664f665826..f3b32eb2d27a 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_read_actor.cpp @@ -27,11 +27,11 @@ namespace NYql::NDq { template T ExtractFromConstFuture(const NThreading::TFuture& f) { - //We want to avoid making a copy of data stored in a future. - //But there is no direct way to extract data from a const future - //So, we make a copy of the future, that is cheap. Then, extract the value from this copy. - //It destructs the value in the original future, but this trick is legal and documented here: - //https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency + // We want to avoid making a copy of data stored in a future. + // But there is no direct way to extract data from a const future + // So, we make a copy of the future, that is cheap. Then, extract the value from this copy. 
+ // It destructs the value in the original future, but this trick is legal and documented here: + // https://docs.yandex-team.ru/arcadia-cpp/cookbook/concurrency return NThreading::TFuture(f).ExtractValueSync(); } diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp index f2651cac0d1c..bbb6e1555c5f 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp +++ b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.cpp @@ -70,4 +70,4 @@ namespace NYql::NDq { } return std::make_unique(); } -} //namespace NYql::NDq +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h index 6ff0d1fd578d..c656e3a38daf 100644 --- a/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h +++ b/ydb/library/yql/providers/generic/actors/yql_generic_token_provider.h @@ -13,7 +13,7 @@ namespace NYql::NDq { class TGenericTokenProvider { public: using TPtr = std::unique_ptr; - TGenericTokenProvider() = default; //No auth required + TGenericTokenProvider() = default; // No auth required TGenericTokenProvider(const TString& staticIamToken); TGenericTokenProvider( const TString& serviceAccountId, @@ -34,4 +34,4 @@ namespace NYql::NDq { const TString& staticIamToken, const TString& serviceAccountId, const TString& ServiceAccountIdSignature, const ISecuredServiceAccountCredentialsFactory::TPtr& credentialsFactory); -} //namespace NYql::NDq +} // namespace NYql::NDq diff --git a/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/test_creds.h b/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/test_creds.h index f3025de07053..cefd4a43c98d 100644 --- a/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/test_creds.h +++ 
b/ydb/library/yql/providers/generic/connector/libcpp/ut_helpers/test_creds.h @@ -33,4 +33,4 @@ namespace NYql::NTestCreds { } }; -} //namespace NYql::NTestCreds +} // namespace NYql::NTestCreds diff --git a/ydb/library/yql/providers/generic/provider/yql_generic_settings.h b/ydb/library/yql/providers/generic/provider/yql_generic_settings.h index 07a19c5ce827..6d1f83c7e407 100644 --- a/ydb/library/yql/providers/generic/provider/yql_generic_settings.h +++ b/ydb/library/yql/providers/generic/provider/yql_generic_settings.h @@ -42,4 +42,4 @@ namespace NYql { THashMap ClusterNamesToClusterConfigs; // cluster name -> cluster config THashMap> DatabaseIdsToClusterNames; // database id -> cluster name }; -} //namespace NYql +} // namespace NYql diff --git a/ydb/library/yql/providers/yt/actors/ut/yql_yt_lookup_actor_ut.cpp b/ydb/library/yql/providers/yt/actors/ut/yql_yt_lookup_actor_ut.cpp index 03f77cf2b21c..5edc60ac5547 100644 --- a/ydb/library/yql/providers/yt/actors/ut/yql_yt_lookup_actor_ut.cpp +++ b/ydb/library/yql/providers/yt/actors/ut/yql_yt_lookup_actor_ut.cpp @@ -49,25 +49,34 @@ class TCallLookupActor: public TActorBootstrapped { TCallLookupActor( std::shared_ptr alloc, const NActors::TActorId& lookupActor, - NDq::IDqAsyncLookupSource::TUnboxedValueMap&& request) + std::shared_ptr request) : Alloc(alloc) , LookupActor(lookupActor) - , Request(std::move(request)) + , Request(request) { } void Bootstrap() { - auto ev = new NDq::IDqAsyncLookupSource::TEvLookupRequest(Alloc, std::move(Request)); + auto ev = new NDq::IDqAsyncLookupSource::TEvLookupRequest(Request); TActivationContext::ActorSystem()->Send(new NActors::IEventHandle(LookupActor, SelfId(), ev)); } + ~TCallLookupActor() { + PassAway(); + } + + void PassAway() override { + auto guard = Guard(*Alloc); + Request.reset(); + } + private: static constexpr char ActorName[] = "TEST"; private: std::shared_ptr Alloc; const NActors::TActorId LookupActor; - NDq::IDqAsyncLookupSource::TUnboxedValueMap Request; + 
std::shared_ptr Request; }; Y_UNIT_TEST(Lookup) { @@ -138,43 +147,43 @@ Y_UNIT_TEST(Lookup) { typeEnv, holderFactory, 1'000'000); - runtime.Register(lookupActor); + auto lookupActorId = runtime.Register(lookupActor); - NDq::IDqAsyncLookupSource::TUnboxedValueMap request{4, keyTypeHelper->GetValueHash(), keyTypeHelper->GetValueEqual()}; - request.emplace(CreateStructValue(holderFactory, {"host1", "vpc1"}), NUdf::TUnboxedValue{}); - request.emplace(CreateStructValue(holderFactory, {"host2", "vpc1"}), NUdf::TUnboxedValue{}); - request.emplace(CreateStructValue(holderFactory, {"host2", "vpc2"}), NUdf::TUnboxedValue{}); //NOT_FOUND expected - request.emplace(CreateStructValue(holderFactory, {"very very long hostname to for test 2", "vpc2"}), NUdf::TUnboxedValue{}); + auto request = std::make_shared(4, keyTypeHelper->GetValueHash(), keyTypeHelper->GetValueEqual()); + request->emplace(CreateStructValue(holderFactory, {"host1", "vpc1"}), NUdf::TUnboxedValue{}); + request->emplace(CreateStructValue(holderFactory, {"host2", "vpc1"}), NUdf::TUnboxedValue{}); + request->emplace(CreateStructValue(holderFactory, {"host2", "vpc2"}), NUdf::TUnboxedValue{}); //NOT_FOUND expected + request->emplace(CreateStructValue(holderFactory, {"very very long hostname to for test 2", "vpc2"}), NUdf::TUnboxedValue{}); guard.Release(); //let actors use alloc - auto callLookupActor = new TCallLookupActor(alloc, lookupActor->SelfId(), std::move(request)); + auto callLookupActor = new TCallLookupActor(alloc, lookupActorId, request); runtime.Register(callLookupActor); auto ev = runtime.GrabEdgeEventRethrow(edge); auto guard2 = Guard(*alloc.get()); - auto lookupResult = std::move(ev->Get()->Result); - UNIT_ASSERT_EQUAL(4, lookupResult.size()); + auto lookupResult = ev->Get()->Result.lock(); + UNIT_ASSERT_EQUAL(4, lookupResult->size()); { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {"host1", "vpc1"})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, 
{"host1", "vpc1"})); UNIT_ASSERT(v); UNIT_ASSERT(CheckStructValue(*v, {"host1.vpc1.net", "192.168.1.1"})); } { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {"host2", "vpc1"})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {"host2", "vpc1"})); UNIT_ASSERT(v); UNIT_ASSERT(CheckStructValue(*v, {"host2.vpc1.net", "192.168.1.2"})); } { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {"host2", "vpc2"})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {"host2", "vpc2"})); UNIT_ASSERT(v); UNIT_ASSERT(!*v); } { - const auto* v = lookupResult.FindPtr(CreateStructValue(holderFactory, {"very very long hostname to for test 2", "vpc2"})); + const auto* v = lookupResult->FindPtr(CreateStructValue(holderFactory, {"very very long hostname to for test 2", "vpc2"})); UNIT_ASSERT(v); UNIT_ASSERT(CheckStructValue(*v, {"very very long fqdn for test 2", "192.168.100.2"})); } } -} //Y_UNIT_TEST_SUITE(GenericProviderLookupActor) \ No newline at end of file +} //Y_UNIT_TEST_SUITE(GenericProviderLookupActor) diff --git a/ydb/library/yql/providers/yt/actors/yql_yt_lookup_actor.cpp b/ydb/library/yql/providers/yt/actors/yql_yt_lookup_actor.cpp index 35298adb6292..a01cfdbc1c8e 100644 --- a/ydb/library/yql/providers/yt/actors/yql_yt_lookup_actor.cpp +++ b/ydb/library/yql/providers/yt/actors/yql_yt_lookup_actor.cpp @@ -87,12 +87,17 @@ class TYtLookupActor { } ~TYtLookupActor() { + Free(); + } + +private: + void Free() { auto guard = Guard(*Alloc); KeyTypeHelper.reset(); TKeyTypeHelper empty; Data = IDqAsyncLookupSource::TUnboxedValueMap{0, empty.GetValueHash(), empty.GetValueEqual()}; } - +public: void Bootstrap() { YQL_CLOG(INFO, ProviderYt) << "New Yt proivider lookup source actor(ActorId=" << SelfId() << ") for" @@ -156,21 +161,30 @@ class TYtLookupActor size_t GetMaxSupportedKeysInRequest() const override { return MaxKeysInRequest; } - void 
AsyncLookup(IDqAsyncLookupSource::TUnboxedValueMap&& request) override { - YQL_CLOG(DEBUG, ProviderYt) << "ActorId=" << SelfId() << " Got LookupRequest for " << request.size() << " keys"; + void AsyncLookup(std::weak_ptr wrequest) override { Y_ABORT_IF(InProgress); - Y_ABORT_IF(request.size() > MaxKeysInRequest); - InProgress = true; auto guard = Guard(*Alloc); - for (const auto& [k, _]: request) { + auto request = wrequest.lock(); + if (!request) { + YQL_CLOG(DEBUG, ProviderYt) << "ActorId=" << SelfId() << " LookupRequest was lost"; + return; + } + YQL_CLOG(DEBUG, ProviderYt) << "ActorId=" << SelfId() << " Got LookupRequest for " << request->size() << " keys"; + InProgress = true; + Y_ABORT_IF(request->size() > MaxKeysInRequest); + for (auto& [k, val]: *request) { if (const auto* v = Data.FindPtr(k)) { - request[k] = *v; + val = *v; } } - auto ev = new IDqAsyncLookupSource::TEvLookupResult(Alloc, std::move(request)); + auto ev = new IDqAsyncLookupSource::TEvLookupResult(request); TActivationContext::ActorSystem()->Send(new NActors::IEventHandle(ParentId, SelfId(), ev)); InProgress = false; } + void PassAway() override { + Free(); + TBase::PassAway(); + } private: //events STRICT_STFUNC(StateFunc, diff --git a/ydb/tests/fq/generic/test_streaming_join.py b/ydb/tests/fq/generic/test_streaming_join.py index e5018d23b1c6..2ee95ff83130 100644 --- a/ydb/tests/fq/generic/test_streaming_join.py +++ b/ydb/tests/fq/generic/test_streaming_join.py @@ -130,6 +130,7 @@ def freeze(json): ('{"id":9,"user":3}', '{"id":9,"user_id":3,"lookup":"ydb30"}'), ('{"id":2,"user":2}', '{"id":2,"user_id":2,"lookup":"ydb20"}'), ('{"id":1,"user":1}', '{"id":1,"user_id":1,"lookup":"ydb10"}'), + ('{"id":10,"user":null}', '{"id":10,"user_id":null,"lookup":null}'), ('{"id":4,"user":3}', '{"id":4,"user_id":3,"lookup":"ydb30"}'), ('{"id":5,"user":3}', '{"id":5,"user_id":3,"lookup":"ydb30"}'), ('{"id":6,"user":1}', '{"id":6,"user_id":1,"lookup":"ydb10"}'), @@ -349,6 +350,10 @@ def freeze(json): 
'{"id":3,"za":2,"yb":"1","yc":114,"zd":115}', '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":2,"yb":"1","yc":114,"zd":115}', ), + ( + '{"id":3,"za":2,"yb":null,"yc":114,"zd":115}', + '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":2,"yb":null,"yc":114,"zd":115}', + ), ] ), ), @@ -390,6 +395,10 @@ def freeze(json): '{"id":3,"za":2,"yb":"1","yc":114,"zd":115}', '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":2,"yb":"1","yc":114,"zd":115}', ), + ( + '{"id":3,"za":null,"yb":"1","yc":114,"zd":115}', + '{"a":null,"b":null,"c":null,"d":null,"e":null,"f":null,"za":null,"yb":"1","yc":114,"zd":115}', + ), ] ), ), @@ -506,6 +515,19 @@ def test_streamlookup( messages_ctr = Counter(map(freeze, map(json.loads, map(itemgetter(1), messages)))) assert read_data_ctr == messages_ctr + for node_index in kikimr.compute_plane.kikimr_cluster.nodes: + sensors = kikimr.compute_plane.get_sensors(node_index, "dq_tasks") + for component in ["Lookup", "LookupSrc"]: + componentSensors = sensors.find_sensors( + labels={"operation": query_id, "component": component}, + key_label="sensor", + ) + for k in componentSensors: + print( + f'node[{node_index}].operation[{query_id}].component[{component}].{k} = {componentSensors[k]}', + file=sys.stderr, + ) + fq_client.abort_query(query_id) fq_client.wait_query(query_id) diff --git a/ydb/tests/tools/fq_runner/kikimr_metrics.py b/ydb/tests/tools/fq_runner/kikimr_metrics.py index 006e95119738..08825289171b 100644 --- a/ydb/tests/tools/fq_runner/kikimr_metrics.py +++ b/ydb/tests/tools/fq_runner/kikimr_metrics.py @@ -31,7 +31,7 @@ def find_sensors(self, labels, key_label): continue v = lbls.get(key_label, None) if v is not None: - result[v] = s["value"] + result[v] = s.get("value", None) return result def collect_non_zeros(self): From 6d2ad6ac57e28eab832215b202ded68674f7472e Mon Sep 17 00:00:00 2001 From: Alexey Pozdniakov Date: Wed, 23 Oct 2024 12:23:54 +0300 Subject: [PATCH 45/56] [YQ-3621] support AFTER MATCH SKIP 
PAST LAST ROW (#10597) (#10739) --- .../yql/core/sql_types/match_recognize.h | 15 ++ .../comp_nodes/mkql_match_recognize.cpp | 179 +++--------------- .../comp_nodes/mkql_match_recognize_nfa.h | 7 +- .../comp_nodes/ut/mkql_match_recognize_ut.cpp | 4 +- .../yql/minikql/mkql_program_builder.cpp | 7 +- .../yql/minikql/mkql_program_builder.h | 3 +- .../yql/minikql/mkql_runtime_version.h | 2 +- .../common/mkql/yql_provider_mkql.cpp | 11 +- ydb/library/yql/sql/v1/match_recognize.cpp | 4 +- ydb/library/yql/sql/v1/match_recognize.h | 21 +- .../yql/sql/v1/sql_match_recognize.cpp | 32 ++-- ydb/library/yql/sql/v1/sql_match_recognize.h | 2 +- .../sql/dq_file/part5/canondata/result.json | 22 +++ .../hybrid_file/part4/canondata/result.json | 14 ++ .../tests/sql/sql2yql/canondata/result.json | 38 ++-- .../after_match_skip_past_last_row.sql | 19 ++ .../match_recognize/alerts-streaming.sql | 1 + .../sql/suites/match_recognize/alerts.sql | 1 + .../match_recognize/alerts_without_order.sql | 1 + .../sql/suites/match_recognize/permute.sql | 1 + .../part5/canondata/result.json | 21 ++ 21 files changed, 199 insertions(+), 206 deletions(-) create mode 100644 ydb/library/yql/tests/sql/suites/match_recognize/after_match_skip_past_last_row.sql diff --git a/ydb/library/yql/core/sql_types/match_recognize.h b/ydb/library/yql/core/sql_types/match_recognize.h index e30e11eb2131..0c6105ad9413 100644 --- a/ydb/library/yql/core/sql_types/match_recognize.h +++ b/ydb/library/yql/core/sql_types/match_recognize.h @@ -8,6 +8,21 @@ namespace NYql::NMatchRecognize { +enum class EAfterMatchSkipTo { + NextRow, + PastLastRow, + ToFirst, + ToLast, + To +}; + +struct TAfterMatchSkipTo { + EAfterMatchSkipTo To; + TString Var; + + [[nodiscard]] bool operator==(const TAfterMatchSkipTo&) const noexcept = default; +}; + constexpr size_t MaxPatternNesting = 20; //Limit recursion for patterns constexpr size_t MaxPermutedItems = 6; diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp 
b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp index 7530cf06df25..772bd6cbd0f7 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize.cpp @@ -39,131 +39,13 @@ struct TMatchRecognizeProcessorParameters { TMeasureInputColumnOrder MeasureInputColumnOrder; TComputationNodePtrVector Measures; TOutputColumnOrder OutputColumnOrder; -}; - -class TBackTrackingMatchRecognize { - using TPartitionList = TSimpleList; - using TRange = TPartitionList::TRange; - using TMatchedVars = TMatchedVars; -public: - //TODO(YQL-16486): create a tree for backtracking(replace var names with indexes) - - struct TPatternConfiguration { - void Save(TMrOutputSerializer& /*serializer*/) const { - } - - void Load(TMrInputSerializer& /*serializer*/) { - } - - friend bool operator==(const TPatternConfiguration&, const TPatternConfiguration&) { - return true; - } - }; - - struct TPatternConfigurationBuilder { - using TPatternConfigurationPtr = std::shared_ptr; - static TPatternConfigurationPtr Create(const TRowPattern& pattern, const THashMap& varNameToIndex) { - Y_UNUSED(pattern); - Y_UNUSED(varNameToIndex); - return std::make_shared(); - } - }; - - TBackTrackingMatchRecognize( - NUdf::TUnboxedValue&& partitionKey, - const TMatchRecognizeProcessorParameters& parameters, - const TPatternConfigurationBuilder::TPatternConfigurationPtr pattern, - const TContainerCacheOnContext& cache - ) - : PartitionKey(std::move(partitionKey)) - , Parameters(parameters) - , Cache(cache) - , CurMatchedVars(parameters.Defines.size()) - , MatchNumber(0) - { - //TODO(YQL-16486) - Y_UNUSED(pattern); - } - - bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - Y_UNUSED(ctx); - Rows.Append(std::move(row)); - return false; - } - NUdf::TUnboxedValue GetOutputIfReady(TComputationContext& ctx) { - if (Matches.empty()) - return NUdf::TUnboxedValue{}; - Parameters.MatchedVarsArg->SetValue(ctx, 
ToValue(ctx.HolderFactory, std::move(Matches.front()))); - Matches.pop_front(); - Parameters.MeasureInputDataArg->SetValue(ctx, ctx.HolderFactory.Create( - Parameters.InputDataArg->GetValue(ctx), - Parameters.MeasureInputColumnOrder, - Parameters.MatchedVarsArg->GetValue(ctx), - Parameters.VarNames, - ++MatchNumber - )); - NUdf::TUnboxedValue *itemsPtr = nullptr; - const auto result = Cache.NewArray(ctx, Parameters.OutputColumnOrder.size(), itemsPtr); - for (auto const& c: Parameters.OutputColumnOrder) { - switch(c.first) { - case EOutputColumnSource::Measure: - *itemsPtr++ = Parameters.Measures[c.second]->GetValue(ctx); - break; - case EOutputColumnSource::PartitionKey: - *itemsPtr++ = PartitionKey.GetElement(c.second); - break; - } - } - return result; - } - bool ProcessEndOfData(TComputationContext& ctx) { - //Assume, that data moved to IComputationExternalNode node, will not be modified or released - //till the end of the current function - auto rowsSize = Rows.Size(); - Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create>(Rows)); - for (size_t i = 0; i != rowsSize; ++i) { - Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(static_cast(i))); - for (size_t v = 0; v != Parameters.Defines.size(); ++v) { - const auto &d = Parameters.Defines[v]->GetValue(ctx); - if (d && d.GetOptionalValue().Get()) { - Extend(CurMatchedVars[v], TRange{i}); - } - } - //for the sake of dummy usage assume non-overlapped matches at every 5th row of any partition - if (i % 5 == 0) { - TMatchedVars temp; - temp.swap(CurMatchedVars); - Matches.emplace_back(std::move(temp)); - CurMatchedVars.resize(Parameters.Defines.size()); - } - } - return not Matches.empty(); - } - - void Save(TOutputSerializer& /*serializer*/) const { - // Not used in not streaming mode. - } - - void Load(TMrInputSerializer& /*serializer*/) { - // Not used in not streaming mode. 
- } - -private: - const NUdf::TUnboxedValue PartitionKey; - const TMatchRecognizeProcessorParameters& Parameters; - const TContainerCacheOnContext& Cache; - TSimpleList Rows; - TMatchedVars CurMatchedVars; - std::deque> Matches; - ui64 MatchNumber; + TAfterMatchSkipTo SkipTo; }; class TStreamingMatchRecognize { using TPartitionList = TSparseList; using TRange = TPartitionList::TRange; public: - using TPatternConfiguration = TNfaTransitionGraph; - using TPatternConfigurationBuilder = TNfaTransitionGraphBuilder; TStreamingMatchRecognize( NUdf::TUnboxedValue&& partitionKey, const TMatchRecognizeProcessorParameters& parameters, @@ -213,6 +95,9 @@ class TStreamingMatchRecognize { break; } } + if (EAfterMatchSkipTo::PastLastRow == Parameters.SkipTo.To) { + Nfa.Clear(); + } return result; } bool ProcessEndOfData(TComputationContext& ctx) { @@ -243,11 +128,9 @@ class TStreamingMatchRecognize { ui64 MatchNumber = 0; }; -template class TStateForNonInterleavedPartitions - : public TComputationValue> + : public TComputationValue { - using TRowPatternConfigurationBuilder = typename Algo::TPatternConfigurationBuilder; public: TStateForNonInterleavedPartitions( TMemoryUsageInfo* memInfo, @@ -265,7 +148,7 @@ class TStateForNonInterleavedPartitions , PartitionKey(partitionKey) , PartitionKeyPacker(true, partitionKeyType) , Parameters(parameters) - , RowPatternConfiguration(TRowPatternConfigurationBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) + , RowPatternConfiguration(TNfaTransitionGraphBuilder::Create(parameters.Pattern, parameters.VarNamesLookup)) , Cache(cache) , Terminating(false) , SerializerContext(ctx, rowType, rowPacker) @@ -301,7 +184,7 @@ class TStateForNonInterleavedPartitions bool validPartitionHandler = in.Read(); if (validPartitionHandler) { NUdf::TUnboxedValue key = PartitionKeyPacker.Unpack(CurPartitionPackedKey, SerializerContext.Ctx.HolderFactory); - PartitionHandler.reset(new Algo( + PartitionHandler.reset(new TStreamingMatchRecognize( 
std::move(key), Parameters, RowPatternConfiguration, @@ -313,7 +196,7 @@ class TStateForNonInterleavedPartitions if (validDelayedRow) { in(DelayedRow); } - auto restoredRowPatternConfiguration = std::make_shared(); + auto restoredRowPatternConfiguration = std::make_shared(); restoredRowPatternConfiguration->Load(in); MKQL_ENSURE(*restoredRowPatternConfiguration == *RowPatternConfiguration, "Restored and current RowPatternConfiguration is different"); MKQL_ENSURE(in.Empty(), "State is corrupted"); @@ -367,12 +250,11 @@ class TStateForNonInterleavedPartitions InputRowArg->SetValue(ctx, NUdf::TUnboxedValue(temp)); auto partitionKey = PartitionKey->GetValue(ctx); CurPartitionPackedKey = PartitionKeyPacker.Pack(partitionKey); - PartitionHandler.reset(new Algo( + PartitionHandler.reset(new TStreamingMatchRecognize( std::move(partitionKey), Parameters, RowPatternConfiguration, - Cache - )); + Cache)); PartitionHandler->ProcessInputRow(std::move(temp), ctx); } if (Terminating) { @@ -382,12 +264,12 @@ class TStateForNonInterleavedPartitions } private: TString CurPartitionPackedKey; - std::unique_ptr PartitionHandler; + std::unique_ptr PartitionHandler; IComputationExternalNode* InputRowArg; IComputationNode* PartitionKey; TValuePackerGeneric PartitionKeyPacker; const TMatchRecognizeProcessorParameters& Parameters; - const typename TRowPatternConfigurationBuilder::TPatternConfigurationPtr RowPatternConfiguration; + const TNfaTransitionGraph::TPtr RowPatternConfiguration; const TContainerCacheOnContext& Cache; NUdf::TUnboxedValue DelayedRow; bool Terminating; @@ -768,6 +650,11 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation defines.push_back(callable.GetInput(inputIndex++)); } const auto& streamingMode = callable.GetInput(inputIndex++); + NYql::NMatchRecognize::TAfterMatchSkipTo skipTo = {NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, ""}; + if (inputIndex + 2 <= callable.GetInputsCount()) { + skipTo.To = 
static_cast(AS_VALUE(TDataLiteral, callable.GetInput(inputIndex++))->AsValue().Get()); + skipTo.Var = AS_VALUE(TDataLiteral, callable.GetInput(inputIndex++))->AsValue().AsStringRef(); + } MKQL_ENSURE(callable.GetInputsCount() == inputIndex, "Wrong input count"); const auto& [vars, varsLookup] = ConvertListOfStrings(varNames); @@ -788,6 +675,7 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation ) , ConvertVectorOfCallables(measures, ctx) , GetOutputColumnOrder(partitionColumnIndexes, measureColumnIndexes) + , skipTo }; if (AS_VALUE(TDataLiteral, streamingMode)->AsValue().Get()) { return new TMatchRecognizeWrapper(ctx.Mutables @@ -800,28 +688,15 @@ IComputationNode* WrapMatchRecognizeCore(TCallable& callable, const TComputation , rowType ); } else { - const bool useNfaForTables = true; //TODO(YQL-16486) get this flag from an optimizer - if (useNfaForTables) { - return new TMatchRecognizeWrapper>(ctx.Mutables - , GetValueRepresentation(inputFlow.GetStaticType()) - , LocateNode(ctx.NodeLocator, *inputFlow.GetNode()) - , static_cast(LocateNode(ctx.NodeLocator, *inputRowArg.GetNode())) - , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) - , partitionKeySelector.GetStaticType() - , std::move(parameters) - , rowType - ); - } else { - return new TMatchRecognizeWrapper>(ctx.Mutables - , GetValueRepresentation(inputFlow.GetStaticType()) - , LocateNode(ctx.NodeLocator, *inputFlow.GetNode()) - , static_cast(LocateNode(ctx.NodeLocator, *inputRowArg.GetNode())) - , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) - , partitionKeySelector.GetStaticType() - , std::move(parameters) - , rowType - ); - } + return new TMatchRecognizeWrapper(ctx.Mutables + , GetValueRepresentation(inputFlow.GetStaticType()) + , LocateNode(ctx.NodeLocator, *inputFlow.GetNode()) + , static_cast(LocateNode(ctx.NodeLocator, *inputRowArg.GetNode())) + , LocateNode(ctx.NodeLocator, *partitionKeySelector.GetNode()) + , 
partitionKeySelector.GetStaticType() + , std::move(parameters) + , rowType + ); } } diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h index 77df6f1f66f5..398047e33c43 100644 --- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h +++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h @@ -283,8 +283,7 @@ class TNfaTransitionGraphBuilder { return {input, output}; } public: - using TPatternConfigurationPtr = TNfaTransitionGraph::TPtr; - static TPatternConfigurationPtr Create(const TRowPattern& pattern, const THashMap& varNameToIndex) { + static TNfaTransitionGraph::TPtr Create(const TRowPattern& pattern, const THashMap& varNameToIndex) { auto result = std::make_shared(); TNfaTransitionGraphBuilder builder(result); auto item = builder.BuildTerms(pattern, varNameToIndex); @@ -455,6 +454,10 @@ class TNfa { serializer.Read(EpsilonTransitionsLastRow); } + void Clear() { + ActiveStates.clear(); + } + private: //TODO (zverevgeny): Consider to change to std::vector for the sake of perf using TStateSet = std::set, TMKQLAllocator>; diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp index 08a675535d08..387762e9bd5e 100644 --- a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp +++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_ut.cpp @@ -115,7 +115,9 @@ namespace NKikimr { {NYql::NMatchRecognize::TRowPatternFactor{"A", 3, 3, false, false, false}} }, getDefines, - streamingMode); + streamingMode, + {NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, ""} + ); auto graph = setup.BuildGraph(pgmReturn); return graph; diff --git a/ydb/library/yql/minikql/mkql_program_builder.cpp b/ydb/library/yql/minikql/mkql_program_builder.cpp index 2272ae640b3e..dccdf7ad5ff7 100644 --- a/ydb/library/yql/minikql/mkql_program_builder.cpp +++ 
b/ydb/library/yql/minikql/mkql_program_builder.cpp @@ -5902,7 +5902,8 @@ TRuntimeNode TProgramBuilder::MatchRecognizeCore( const TArrayRef>& getMeasures, const NYql::NMatchRecognize::TRowPattern& pattern, const TArrayRef>& getDefines, - bool streamingMode + bool streamingMode, + const NYql::NMatchRecognize::TAfterMatchSkipTo& skipTo ) { MKQL_ENSURE(RuntimeVersion >= 42, "MatchRecognize is not supported in runtime version " << RuntimeVersion); @@ -6056,6 +6057,10 @@ TRuntimeNode TProgramBuilder::MatchRecognizeCore( callableBuilder.Add(d); } callableBuilder.Add(NewDataLiteral(streamingMode)); + if (RuntimeVersion >= 52U) { + callableBuilder.Add(NewDataLiteral(static_cast(skipTo.To))); + callableBuilder.Add(NewDataLiteral(skipTo.Var)); + } return TRuntimeNode(callableBuilder.Build(), false); } diff --git a/ydb/library/yql/minikql/mkql_program_builder.h b/ydb/library/yql/minikql/mkql_program_builder.h index 97daf163c586..34fe726a4f83 100644 --- a/ydb/library/yql/minikql/mkql_program_builder.h +++ b/ydb/library/yql/minikql/mkql_program_builder.h @@ -696,7 +696,8 @@ class TProgramBuilder : public TTypeBuilder { const TArrayRef>& getMeasures, const NYql::NMatchRecognize::TRowPattern& pattern, const TArrayRef>& getDefines, - bool streamingMode + bool streamingMode, + const NYql::NMatchRecognize::TAfterMatchSkipTo& skipTo ); TRuntimeNode TimeOrderRecover( diff --git a/ydb/library/yql/minikql/mkql_runtime_version.h b/ydb/library/yql/minikql/mkql_runtime_version.h index 22072157e87f..bfd26216ab87 100644 --- a/ydb/library/yql/minikql/mkql_runtime_version.h +++ b/ydb/library/yql/minikql/mkql_runtime_version.h @@ -24,7 +24,7 @@ namespace NMiniKQL { // 1. Bump this version every time incompatible runtime nodes are introduced. // 2. Make sure you provide runtime node generation for previous runtime versions. 
#ifndef MKQL_RUNTIME_VERSION -#define MKQL_RUNTIME_VERSION 50U +#define MKQL_RUNTIME_VERSION 52U #endif // History: diff --git a/ydb/library/yql/providers/common/mkql/yql_provider_mkql.cpp b/ydb/library/yql/providers/common/mkql/yql_provider_mkql.cpp index f72f6f32e1bb..39c3383607f5 100644 --- a/ydb/library/yql/providers/common/mkql/yql_provider_mkql.cpp +++ b/ydb/library/yql/providers/common/mkql/yql_provider_mkql.cpp @@ -19,6 +19,7 @@ #include #include +#include #include @@ -874,6 +875,7 @@ TMkqlCommonCallableCompiler::TShared::TShared() { //explore params const auto& measures = params->ChildRef(0); + const auto& skipTo = params->ChildRef(2); const auto& pattern = params->ChildRef(3); const auto& defines = params->ChildRef(4); @@ -916,6 +918,12 @@ TMkqlCommonCallableCompiler::TShared::TShared() { }; } + auto stringTo = skipTo->Child(0)->Content(); + auto var = skipTo->Child(1)->Content(); + MKQL_ENSURE(stringTo.SkipPrefix("AfterMatchSkip_"), R"(MATCH_RECOGNIZE: should start with "AfterMatchSkip_")"); + NYql::NMatchRecognize::EAfterMatchSkipTo to; + MKQL_ENSURE(TryFromString(stringTo, to), "MATCH_RECOGNIZE: cannot parse AfterMatchSkipTo mode"); + const auto streamingMode = FromString(settings->Child(0)->Child(1)->Content()); return ctx.ProgramBuilder.MatchRecognizeCore( @@ -925,7 +933,8 @@ TMkqlCommonCallableCompiler::TShared::TShared() { getMeasures, NYql::NMatchRecognize::ConvertPattern(pattern, ctx.ExprCtx), getDefines, - streamingMode + streamingMode, + NYql::NMatchRecognize::TAfterMatchSkipTo{to, TString{var}} ); }); diff --git a/ydb/library/yql/sql/v1/match_recognize.cpp b/ydb/library/yql/sql/v1/match_recognize.cpp index 284654b097f8..47055e2f3d7b 100644 --- a/ydb/library/yql/sql/v1/match_recognize.cpp +++ b/ydb/library/yql/sql/v1/match_recognize.cpp @@ -22,7 +22,7 @@ class TMatchRecognize: public TAstListNode { std::pair>&& sortSpecs, std::pair>&& measures, std::pair&& rowsPerMatch, - std::pair&& skipTo, + std::pair&& skipTo, std::pair&& pattern, 
std::pair&& subset, std::pair>&& definitions @@ -57,7 +57,7 @@ class TMatchRecognize: public TAstListNode { std::pair>&& sortSpecs, std::pair>&& measures, std::pair&& rowsPerMatch, - std::pair&& skipTo, + std::pair&& skipTo, std::pair&& pattern, std::pair&& subset, std::pair>&& definitions diff --git a/ydb/library/yql/sql/v1/match_recognize.h b/ydb/library/yql/sql/v1/match_recognize.h index 5b64de823193..7945e86e5b8e 100644 --- a/ydb/library/yql/sql/v1/match_recognize.h +++ b/ydb/library/yql/sql/v1/match_recognize.h @@ -15,23 +15,6 @@ enum class ERowsPerMatch { AllRows }; -enum class EAfterMatchSkipTo { - NextRow, - PastLastRow, - ToFirst, - ToLast, - To -}; - -struct TAfterMatchSkipTo { - TAfterMatchSkipTo(EAfterMatchSkipTo to, const TStringBuf var = TStringBuf()) - : To(to) - , Var(var) - {} - EAfterMatchSkipTo To; - TString Var; -}; - class TMatchRecognizeBuilder: public TSimpleRefCount { public: TMatchRecognizeBuilder( @@ -40,7 +23,7 @@ class TMatchRecognizeBuilder: public TSimpleRefCount { std::pair>&& sortSpecs, std::pair>&& measures, std::pair&& rowsPerMatch, - std::pair&& skipTo, + std::pair&& skipTo, std::pair&& pattern, std::pair&& subset, std::pair>&& definitions @@ -63,7 +46,7 @@ class TMatchRecognizeBuilder: public TSimpleRefCount { std::pair> SortSpecs; std::pair> Measures; std::pair RowsPerMatch; - std::pair SkipTo; + std::pair SkipTo; std::pair Pattern; std::pair Subset; std::pair> Definitions; diff --git a/ydb/library/yql/sql/v1/sql_match_recognize.cpp b/ydb/library/yql/sql/v1/sql_match_recognize.cpp index c168f60bd494..5a2ff31c5fa0 100644 --- a/ydb/library/yql/sql/v1/sql_match_recognize.cpp +++ b/ydb/library/yql/sql/v1/sql_match_recognize.cpp @@ -78,13 +78,19 @@ TMatchRecognizeBuilderPtr TSqlMatchRecognizeClause::CreateBuilder(const NSQLv1Ge //this block is located before pattern block in grammar, // but depends on it, so it is processed after pattern block - std::pair skipTo { pos, TAfterMatchSkipTo{EAfterMatchSkipTo::NextRow, TString()} }; + 
std::pair skipTo { + pos, + NYql::NMatchRecognize::TAfterMatchSkipTo{ + NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, + TString() + } + }; if (commonSyntax.HasBlock1()){ skipTo = ParseAfterMatchSkipTo(commonSyntax.GetBlock1().GetRule_row_pattern_skip_to3()); const auto varRequired = - EAfterMatchSkipTo::ToFirst == skipTo.second.To || - EAfterMatchSkipTo::ToLast == skipTo.second.To || - EAfterMatchSkipTo::To == skipTo.second.To; + NYql::NMatchRecognize::EAfterMatchSkipTo::ToFirst == skipTo.second.To || + NYql::NMatchRecognize::EAfterMatchSkipTo::ToLast == skipTo.second.To || + NYql::NMatchRecognize::EAfterMatchSkipTo::To == skipTo.second.To; if (varRequired) { const auto& allVars = NYql::NMatchRecognize::GetPatternVars(pattern); if (allVars.find(skipTo.second.Var) == allVars.cend()) { @@ -186,39 +192,39 @@ std::pair TSqlMatchRecognizeClause::ParseRowsPerMatch( } } -std::pair TSqlMatchRecognizeClause::ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause) { +std::pair TSqlMatchRecognizeClause::ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause) { switch (skipToClause.GetAltCase()) { case TRule_row_pattern_skip_to::kAltRowPatternSkipTo1: return std::pair{ TokenPosition(skipToClause.GetAlt_row_pattern_skip_to1().GetToken1()), - TAfterMatchSkipTo{EAfterMatchSkipTo::NextRow} + NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, ""} }; case TRule_row_pattern_skip_to::kAltRowPatternSkipTo2: return std::pair{ TokenPosition(skipToClause.GetAlt_row_pattern_skip_to2().GetToken1()), - TAfterMatchSkipTo{EAfterMatchSkipTo::PastLastRow} + NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, ""} }; case TRule_row_pattern_skip_to::kAltRowPatternSkipTo3: return std::pair{ TokenPosition(skipToClause.GetAlt_row_pattern_skip_to3().GetToken1()), - TAfterMatchSkipTo{ - EAfterMatchSkipTo::ToFirst, + NYql::NMatchRecognize::TAfterMatchSkipTo{ + 
NYql::NMatchRecognize::EAfterMatchSkipTo::ToFirst, skipToClause.GetAlt_row_pattern_skip_to3().GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() } }; case TRule_row_pattern_skip_to::kAltRowPatternSkipTo4: return std::pair{ TokenPosition(skipToClause.GetAlt_row_pattern_skip_to4().GetToken1()), - TAfterMatchSkipTo{ - EAfterMatchSkipTo::ToLast, + NYql::NMatchRecognize::TAfterMatchSkipTo{ + NYql::NMatchRecognize::EAfterMatchSkipTo::ToLast, skipToClause.GetAlt_row_pattern_skip_to4().GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() } }; case TRule_row_pattern_skip_to::kAltRowPatternSkipTo5: return std::pair{ TokenPosition(skipToClause.GetAlt_row_pattern_skip_to5().GetToken1()), - TAfterMatchSkipTo{ - EAfterMatchSkipTo::To, + NYql::NMatchRecognize::TAfterMatchSkipTo{ + NYql::NMatchRecognize::EAfterMatchSkipTo::To, skipToClause.GetAlt_row_pattern_skip_to5().GetRule_row_pattern_skip_to_variable_name3().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() } }; diff --git a/ydb/library/yql/sql/v1/sql_match_recognize.h b/ydb/library/yql/sql/v1/sql_match_recognize.h index d8d618920aa4..6766acc95375 100644 --- a/ydb/library/yql/sql/v1/sql_match_recognize.h +++ b/ydb/library/yql/sql/v1/sql_match_recognize.h @@ -18,7 +18,7 @@ class TSqlMatchRecognizeClause: public TSqlTranslation { TNamedFunction ParseOneMeasure(const TRule_row_pattern_measure_definition& node); TVector ParseMeasures(const TRule_row_pattern_measure_list& node); std::pair ParseRowsPerMatch(const TRule_row_pattern_rows_per_match& rowsPerMatchClause); - std::pair ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause); + std::pair ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause); NYql::NMatchRecognize::TRowPatternTerm ParsePatternTerm(const TRule_row_pattern_term& node); NYql::NMatchRecognize::TRowPattern 
ParsePattern(const TRule_row_pattern& node); TNamedFunction ParseOneDefinition(const TRule_row_pattern_definition& node); diff --git a/ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json index c9bce0e6d28c..5d4c70abf59d 100644 --- a/ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json @@ -2039,6 +2039,28 @@ } ], "test.test[key_filter-utf8_with_legacy--Results]": [], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Analyze]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1031349/a955c852651ea9f8124bef13bd770d8d15af6c2e/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Analyze_/plan.txt" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Debug]": [ + { + "checksum": "7911a3e7570665753b1e25827635db15", + "size": 1317, + "uri": "https://{canondata_backend}/1031349/a955c852651ea9f8124bef13bd770d8d15af6c2e/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1031349/a955c852651ea9f8124bef13bd770d8d15af6c2e/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Plan_/plan.txt" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Results]": [], "test.test[optimizers-test_lmap_opts--Analyze]": [ { "checksum": "a019f0e33bc55441f2581dc8345a6b9e", diff --git a/ydb/library/yql/tests/sql/hybrid_file/part4/canondata/result.json b/ydb/library/yql/tests/sql/hybrid_file/part4/canondata/result.json index ffa1661c069e..c810d141dc3f 100644 --- 
a/ydb/library/yql/tests/sql/hybrid_file/part4/canondata/result.json +++ b/ydb/library/yql/tests/sql/hybrid_file/part4/canondata/result.json @@ -1665,6 +1665,20 @@ "uri": "https://{canondata_backend}/1931696/8382830b676a61af36d1344910d51cd1bf39f3ef/resource.tar.gz#test.test_limit-empty_sort_calc_after_limit-default.txt-Plan_/plan.txt" } ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Debug]": [ + { + "checksum": "e6a51f8c3ed77a2c4dfdf2e55ec4517d", + "size": 1316, + "uri": "https://{canondata_backend}/1880306/5213fbc312a45950f1152a68258af55d6e4976a2/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Debug_/opt.yql_patched" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1880306/5213fbc312a45950f1152a68258af55d6e4976a2/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Plan_/plan.txt" + } + ], "test.test[optimizers-nonselected_direct_row--Debug]": [ { "checksum": "8b5d70cb31c105309d443a31d1188534", diff --git a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json index 4b4262ce670a..47fe933e5280 100644 --- a/ydb/library/yql/tests/sql/sql2yql/canondata/result.json +++ b/ydb/library/yql/tests/sql/sql2yql/canondata/result.json @@ -10758,6 +10758,13 @@ "uri": "https://{canondata_backend}/1599023/66d0b07d601bb15f0e0b65bb7b4d493f89c1c283/resource.tar.gz#test_sql2yql.test_lineage-with_inline_/sql.yql" } ], + "test_sql2yql.test[match_recognize-after_match_skip_past_last_row]": [ + { + "checksum": "bb84286a97914c6cfd2e47288a49335e", + "size": 3122, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql2yql.test_match_recognize-after_match_skip_past_last_row_/sql.yql" + } + ], 
"test_sql2yql.test[match_recognize-alerts-streaming]": [ { "checksum": "608ebe5a00413e80b8e74157944f0b65", @@ -30008,32 +30015,39 @@ "uri": "https://{canondata_backend}/1599023/66d0b07d601bb15f0e0b65bb7b4d493f89c1c283/resource.tar.gz#test_sql_format.test_lineage-with_inline_/formatted.sql" } ], + "test_sql_format.test[match_recognize-after_match_skip_past_last_row]": [ + { + "checksum": "100b9f9651315a432e18868c21b776d6", + "size": 412, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql_format.test_match_recognize-after_match_skip_past_last_row_/formatted.sql" + } + ], "test_sql_format.test[match_recognize-alerts-streaming]": [ { - "checksum": "b8aa97680d42faf26e093c2a3ccb05f1", - "size": 2939, - "uri": "https://{canondata_backend}/1937001/da4215d5087e56eec0224ec5e7754dafd0b2bdcf/resource.tar.gz#test_sql_format.test_match_recognize-alerts-streaming_/formatted.sql" + "checksum": "d3a3fd90c8a6a758f0067dd66566d37a", + "size": 2968, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql_format.test_match_recognize-alerts-streaming_/formatted.sql" } ], "test_sql_format.test[match_recognize-alerts]": [ { - "checksum": "585357811c1f0240f4c3207baf8d66f3", - "size": 2941, - "uri": "https://{canondata_backend}/1937001/da4215d5087e56eec0224ec5e7754dafd0b2bdcf/resource.tar.gz#test_sql_format.test_match_recognize-alerts_/formatted.sql" + "checksum": "26acb44218b8f1112df875867fe530ef", + "size": 2970, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql_format.test_match_recognize-alerts_/formatted.sql" } ], "test_sql_format.test[match_recognize-alerts_without_order]": [ { - "checksum": "779c2c3a4eab619646509ce5008863e8", - "size": 2906, - "uri": 
"https://{canondata_backend}/1937001/f1ec239726ab3e2cf00695f3d10461ff9ef6c3b0/resource.tar.gz#test_sql_format.test_match_recognize-alerts_without_order_/formatted.sql" + "checksum": "0e6e55207b31bb4597a16821c0b3ac34", + "size": 2935, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql_format.test_match_recognize-alerts_without_order_/formatted.sql" } ], "test_sql_format.test[match_recognize-permute]": [ { - "checksum": "998e6752ce413cc78e952b9958dfab74", - "size": 721, - "uri": "https://{canondata_backend}/1600758/90e7657ff4d9210d12f860921bc22e4e3c794cc5/resource.tar.gz#test_sql_format.test_match_recognize-permute_/formatted.sql" + "checksum": "97960de85a125f078b142f62ebfe938e", + "size": 750, + "uri": "https://{canondata_backend}/1130705/ab8dea65d8ef4022fe05bde8ae56bb987e245f04/resource.tar.gz#test_sql_format.test_match_recognize-permute_/formatted.sql" } ], "test_sql_format.test[match_recognize-simple_paritioning-streaming]": [ diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/after_match_skip_past_last_row.sql b/ydb/library/yql/tests/sql/suites/match_recognize/after_match_skip_past_last_row.sql new file mode 100644 index 000000000000..79d9c7622000 --- /dev/null +++ b/ydb/library/yql/tests/sql/suites/match_recognize/after_match_skip_past_last_row.sql @@ -0,0 +1,19 @@ +pragma FeatureR010="prototype"; +pragma config.flags("MatchRecognizeStream", "disable"); + +$input = SELECT * FROM AS_TABLE([ + <|time:0|>, + <|time:1|>, + <|time:2|>, + <|time:3|>, +]); + +SELECT * FROM $input MATCH_RECOGNIZE( + ORDER BY CAST(time as Timestamp) + MEASURES + FIRST(X.time) as first_time, + LAST(X.time) as last_time + PATTERN (X{2}) + DEFINE + X as True +); diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/alerts-streaming.sql b/ydb/library/yql/tests/sql/suites/match_recognize/alerts-streaming.sql index 5a282ca647ac..efa5bef17ae4 100644 --- 
a/ydb/library/yql/tests/sql/suites/match_recognize/alerts-streaming.sql +++ b/ydb/library/yql/tests/sql/suites/match_recognize/alerts-streaming.sql @@ -29,6 +29,7 @@ FROM AS_TABLE($osquery_data) MATCH_RECOGNIZE( LAST(LOGIN_SUCCESS_SAME_USER.user) as brutforce_login ONE ROW PER MATCH + AFTER MATCH SKIP TO NEXT ROW PATTERN ( LOGIN_SUCCESS_REMOTE ANY_ROW* (SUSPICIOUS_ACTION_SOON | SUSPICIOUS_ACTION_TIMEOUT) | (LOGIN_FAILED_SAME_USER ANY_ROW*){2,} LOGIN_SUCCESS_SAME_USER diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/alerts.sql b/ydb/library/yql/tests/sql/suites/match_recognize/alerts.sql index dc5d70ddbd76..65aac91efdf3 100644 --- a/ydb/library/yql/tests/sql/suites/match_recognize/alerts.sql +++ b/ydb/library/yql/tests/sql/suites/match_recognize/alerts.sql @@ -29,6 +29,7 @@ FROM AS_TABLE($osquery_data) MATCH_RECOGNIZE( LAST(LOGIN_SUCCESS_SAME_USER.user) as brutforce_login ONE ROW PER MATCH + AFTER MATCH SKIP TO NEXT ROW PATTERN ( LOGIN_SUCCESS_REMOTE ANY_ROW* (SUSPICIOUS_ACTION_SOON | SUSPICIOUS_ACTION_TIMEOUT) | (LOGIN_FAILED_SAME_USER ANY_ROW*){2,} LOGIN_SUCCESS_SAME_USER diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql b/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql index 7d92f0f18c7b..4773e16588f7 100644 --- a/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql +++ b/ydb/library/yql/tests/sql/suites/match_recognize/alerts_without_order.sql @@ -28,6 +28,7 @@ FROM AS_TABLE($osquery_data) MATCH_RECOGNIZE( LAST(LOGIN_SUCCESS_SAME_USER.user) as brutforce_login ONE ROW PER MATCH + AFTER MATCH SKIP TO NEXT ROW PATTERN ( LOGIN_SUCCESS_REMOTE ANY_ROW* (SUSPICIOUS_ACTION_SOON | SUSPICIOUS_ACTION_TIMEOUT) | (LOGIN_FAILED_SAME_USER ANY_ROW*){2,} LOGIN_SUCCESS_SAME_USER diff --git a/ydb/library/yql/tests/sql/suites/match_recognize/permute.sql b/ydb/library/yql/tests/sql/suites/match_recognize/permute.sql index ff48795cf791..614f5b77e0c7 100644 --- 
a/ydb/library/yql/tests/sql/suites/match_recognize/permute.sql +++ b/ydb/library/yql/tests/sql/suites/match_recognize/permute.sql @@ -22,6 +22,7 @@ FROM AS_TABLE($data) MATCH_RECOGNIZE( FIRST(B.dt) as b, FIRST(C.dt) as c ONE ROW PER MATCH + AFTER MATCH SKIP TO NEXT ROW PATTERN ( PERMUTE(A, B, C) ) diff --git a/ydb/library/yql/tests/sql/yt_native_file/part5/canondata/result.json b/ydb/library/yql/tests/sql/yt_native_file/part5/canondata/result.json index 1dd48fcb5e40..21197f390700 100644 --- a/ydb/library/yql/tests/sql/yt_native_file/part5/canondata/result.json +++ b/ydb/library/yql/tests/sql/yt_native_file/part5/canondata/result.json @@ -2052,6 +2052,27 @@ "uri": "https://{canondata_backend}/1871182/1ba48914c21beb3df20272c8218b20981a428432/resource.tar.gz#test.test_lineage-window_tablerow-default.txt-Results_/Output.yqlrun.txt.attr" } ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Debug]": [ + { + "checksum": "53f5efe11cf530787e416b733c2b7f53", + "size": 1254, + "uri": "https://{canondata_backend}/1917492/b6d69ba0bdf3cfac8aa79db6bee1738a75d1edc4/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Debug_/opt.yql" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Plan]": [ + { + "checksum": "b4dd508a329723c74293d80f0278c705", + "size": 505, + "uri": "https://{canondata_backend}/1917492/b6d69ba0bdf3cfac8aa79db6bee1738a75d1edc4/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Plan_/plan.txt" + } + ], + "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Results]": [ + { + "checksum": "6583532367519fd2d47e3f77357d5627", + "size": 1591, + "uri": "https://{canondata_backend}/1917492/b6d69ba0bdf3cfac8aa79db6bee1738a75d1edc4/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Results_/results.txt" + } + ], "test.test[optimizers-test_lmap_opts--Debug]": [ { "checksum": 
"e0c91cd592cbfa870bdc55a48f225229", From 88241112d8a909a603a739fd471a30cc94fccffb Mon Sep 17 00:00:00 2001 From: uzhastik Date: Wed, 23 Oct 2024 13:50:47 +0300 Subject: [PATCH 46/56] increase grpc call timeout in TMonitoringGrpcServiceActor (#10734) (#10765) --- .../compute_database_control_plane_service.cpp | 8 ++++++-- ydb/library/grpc/actor_client/grpc_service_client.h | 5 ++--- ydb/library/grpc/actor_client/grpc_service_settings.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ydb/core/fq/libs/compute/ydb/control_plane/compute_database_control_plane_service.cpp b/ydb/core/fq/libs/compute/ydb/control_plane/compute_database_control_plane_service.cpp index 17aafe28a666..9bc0b761179d 100644 --- a/ydb/core/fq/libs/compute/ydb/control_plane/compute_database_control_plane_service.cpp +++ b/ydb/core/fq/libs/compute/ydb/control_plane/compute_database_control_plane_service.cpp @@ -329,17 +329,21 @@ class TComputeDatabaseControlPlaneServiceActor : public NActors::TActorBootstrap return settings; } - static NGrpcActorClient::TGrpcClientSettings CreateGrpcClientSettings(const NConfig::TComputeDatabaseConfig& config) { + static NGrpcActorClient::TGrpcClientSettings CreateGrpcClientSettings(const auto& connection) { NGrpcActorClient::TGrpcClientSettings settings; - const auto& connection = config.GetControlPlaneConnection(); settings.Endpoint = connection.GetEndpoint(); settings.EnableSsl = connection.GetUseSsl(); if (connection.GetCertificateFile()) { settings.CertificateRootCA = StripString(TFileInput(connection.GetCertificateFile()).ReadAll()); } + settings.RequestTimeoutMs = 20 * 1000; // todo: read from config return settings; } + static NGrpcActorClient::TGrpcClientSettings CreateGrpcClientSettings(const NConfig::TComputeDatabaseConfig& config) { + return CreateGrpcClientSettings(config.GetControlPlaneConnection()); + } + void CreateSingleClientActors(const NConfig::TYdbComputeControlPlane::TSingle& singleConfig) { auto globalLoadConfig = 
Config.GetYdb().GetLoadControlConfig(); if (globalLoadConfig.GetEnable()) { diff --git a/ydb/library/grpc/actor_client/grpc_service_client.h b/ydb/library/grpc/actor_client/grpc_service_client.h index e51d1483f4c5..53c38ce0db41 100644 --- a/ydb/library/grpc/actor_client/grpc_service_client.h +++ b/ydb/library/grpc/actor_client/grpc_service_client.h @@ -63,8 +63,6 @@ class TGrpcServiceClient { return mask; } - static constexpr TDuration DEFAULT_TIMEOUT = TDuration::Seconds(10); - struct TGrpcRequest { static const google::protobuf::Message& Obfuscate(const google::protobuf::Message& p) { return p; @@ -117,7 +115,8 @@ class TGrpcServiceClient { } static NYdbGrpc::TGRpcClientConfig InitGrpcConfig(const NGrpcActorClient::TGrpcClientSettings& settings) { - NYdbGrpc::TGRpcClientConfig config(settings.Endpoint, DEFAULT_TIMEOUT, NYdbGrpc::DEFAULT_GRPC_MESSAGE_SIZE_LIMIT, 0, settings.CertificateRootCA); + const TDuration requestTimeout = TDuration::MilliSeconds(settings.RequestTimeoutMs); + NYdbGrpc::TGRpcClientConfig config(settings.Endpoint, requestTimeout, NYdbGrpc::DEFAULT_GRPC_MESSAGE_SIZE_LIMIT, 0, settings.CertificateRootCA); config.EnableSsl = settings.EnableSsl; config.IntChannelParams[GRPC_ARG_KEEPALIVE_TIME_MS] = settings.GrpcKeepAliveTimeMs; config.IntChannelParams[GRPC_ARG_KEEPALIVE_TIMEOUT_MS] = settings.GrpcKeepAliveTimeoutMs; diff --git a/ydb/library/grpc/actor_client/grpc_service_settings.h b/ydb/library/grpc/actor_client/grpc_service_settings.h index 0e7d3775ae20..51ccb27e1522 100644 --- a/ydb/library/grpc/actor_client/grpc_service_settings.h +++ b/ydb/library/grpc/actor_client/grpc_service_settings.h @@ -11,6 +11,7 @@ struct TGrpcClientSettings { ui32 GrpcKeepAliveTimeoutMs = 1000; ui32 GrpcKeepAlivePingInterval = 5000; bool EnableSsl = false; + ui64 RequestTimeoutMs = 10000; // 10 seconds }; } // namespace NGrpcActorClient From 96ebce832b1706da23c1e1d71145c64d17c9461c Mon Sep 17 00:00:00 2001 From: Oleg Doronin Date: Wed, 23 Oct 2024 12:58:26 +0200 
Subject: [PATCH 47/56] =?UTF-8?q?inflight=20has=20been=20fixed=20for=20TCr?= =?UTF-8?q?eateComputeDatabaseActor=20in=20case=20of=20Ti=E2=80=A6=20(#107?= =?UTF-8?q?67)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ydb/core/fq/libs/control_plane_proxy/control_plane_proxy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ydb/core/fq/libs/control_plane_proxy/control_plane_proxy.cpp b/ydb/core/fq/libs/control_plane_proxy/control_plane_proxy.cpp index d07c690ac84e..8ee887767b56 100644 --- a/ydb/core/fq/libs/control_plane_proxy/control_plane_proxy.cpp +++ b/ydb/core/fq/libs/control_plane_proxy/control_plane_proxy.cpp @@ -450,6 +450,7 @@ class TCreateComputeDatabaseActor : public NActors::TActorBootstrappedInFly->Dec(); CPP_LOG_W("Create database timeout. CloudId: " << CloudId << " Scope: " << Scope << " Actor id: " << SelfId()); NYql::TIssues issues; NYql::TIssue issue = MakeErrorIssue(TIssuesIds::TIMEOUT, "Create database: request timeout. 
Try repeating the request later"); From 25d4337fd02b2e5ff1578a8a8ab2ea1f5b264a68 Mon Sep 17 00:00:00 2001 From: yumkam Date: Wed, 23 Oct 2024 15:48:50 +0300 Subject: [PATCH 48/56] streamlookup: fix zero default for MaxDelayedRows backport (#10735) (#10774) --- ydb/library/yql/providers/dq/planner/execution_planner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ydb/library/yql/providers/dq/planner/execution_planner.cpp b/ydb/library/yql/providers/dq/planner/execution_planner.cpp index 7eec1e4d2648..a4c019b67590 100644 --- a/ydb/library/yql/providers/dq/planner/execution_planner.cpp +++ b/ydb/library/yql/providers/dq/planner/execution_planner.cpp @@ -604,6 +604,7 @@ namespace NYql::NDqs { const auto narrowOutputRowType = GetSeqItemType(streamLookup.Ptr()->GetTypeAnn()); Y_ABORT_UNLESS(narrowOutputRowType->GetKind() == ETypeAnnotationKind::Struct); settings.SetNarrowOutputRowType(NYql::NCommon::GetSerializedTypeAnnotation(narrowOutputRowType)); + settings.SetMaxDelayedRows(1'000'000); //TODO configure me settings.SetCacheLimit(1'000'000); //TODO configure me settings.SetCacheTtlSeconds(60); //TODO configure me From 8b0fa73d75bba7acdefd1520ce3972ea3fc0bb5d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 23 Oct 2024 16:58:17 +0300 Subject: [PATCH 49/56] YQ-3743 Add message to Y_ABORT_UNLESS in checkpoint_coordinator / to stable (#10755) --- ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp b/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp index 6ed541976cd3..9baac0b11983 100644 --- a/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp +++ b/ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp @@ -73,7 +73,7 @@ void TCheckpointCoordinator::Handle(NYql::NDqs::TEvReadyState::TPtr& ev) { int tasksSize = GetTasksSize(); const auto& actorIds = ev->Get()->Record.GetActorId(); - Y_ABORT_UNLESS(tasksSize == 
actorIds.size()); + Y_ABORT_UNLESS(tasksSize == actorIds.size(), "tasksSize %d, actorIds size %d, graph id %s", tasksSize, int(actorIds.size()), CoordinatorId.GraphId.c_str()); for (int i = 0; i < tasksSize; ++i) { const auto& task = GetTask(i); From 8ff34f65098a2d543ea9bdb0a4d0a57db69e138a Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 23 Oct 2024 22:38:42 +0300 Subject: [PATCH 50/56] YQ-3766 Shared reading: add unread stats / to stable (#10771) --- .../fq/libs/row_dispatcher/actors_factory.cpp | 4 + .../fq/libs/row_dispatcher/actors_factory.h | 2 + .../libs/row_dispatcher/events/data_plane.h | 8 ++ .../events/topic_session_stats.h | 33 +++++ .../fq/libs/row_dispatcher/row_dispatcher.cpp | 114 +++++++++++++----- .../fq/libs/row_dispatcher/topic_session.cpp | 86 +++++++------ .../fq/libs/row_dispatcher/topic_session.h | 2 + .../row_dispatcher/ut/row_dispatcher_ut.cpp | 2 + .../row_dispatcher/ut/topic_session_ut.cpp | 2 + .../yql/dq/actors/common/retry_queue.cpp | 4 +- .../pq/async_io/dq_pq_rd_read_actor.cpp | 1 + 11 files changed, 190 insertions(+), 68 deletions(-) create mode 100644 ydb/core/fq/libs/row_dispatcher/events/topic_session_stats.h diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp index 287df079ce51..28c6ee11ddbd 100644 --- a/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.cpp @@ -10,6 +10,8 @@ struct TActorFactory : public IActorFactory { NActors::TActorId RegisterTopicSession( const TString& topicPath, + const TString& endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, @@ -20,6 +22,8 @@ struct TActorFactory : public IActorFactory { auto actorPtr = NFq::NewTopicSession( topicPath, + endpoint, + database, config, rowDispatcherActorId, partitionId, diff --git a/ydb/core/fq/libs/row_dispatcher/actors_factory.h 
b/ydb/core/fq/libs/row_dispatcher/actors_factory.h index c222522310d0..4363a3b646f0 100644 --- a/ydb/core/fq/libs/row_dispatcher/actors_factory.h +++ b/ydb/core/fq/libs/row_dispatcher/actors_factory.h @@ -13,6 +13,8 @@ struct IActorFactory : public TThrRefBase { virtual NActors::TActorId RegisterTopicSession( const TString& topicPath, + const TString& endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, diff --git a/ydb/core/fq/libs/row_dispatcher/events/data_plane.h b/ydb/core/fq/libs/row_dispatcher/events/data_plane.h index 5cecb5251674..49c13d002030 100644 --- a/ydb/core/fq/libs/row_dispatcher/events/data_plane.h +++ b/ydb/core/fq/libs/row_dispatcher/events/data_plane.h @@ -6,6 +6,7 @@ #include #include +#include namespace NFq { @@ -26,6 +27,7 @@ struct TEvRowDispatcher { EvCoordinatorChangesSubscribe, EvCoordinatorRequest, EvCoordinatorResult, + EvSessionStatistic, EvEnd, }; @@ -120,6 +122,12 @@ struct TEvRowDispatcher { TEvSessionError() = default; NActors::TActorId ReadActorId; }; + + struct TEvSessionStatistic : public NActors::TEventLocal { + TEvSessionStatistic(const TopicSessionStatistic& stat) + : Stat(stat) {} + TopicSessionStatistic Stat; + }; }; } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/events/topic_session_stats.h b/ydb/core/fq/libs/row_dispatcher/events/topic_session_stats.h new file mode 100644 index 000000000000..b53f6399a047 --- /dev/null +++ b/ydb/core/fq/libs/row_dispatcher/events/topic_session_stats.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +namespace NFq { + +struct TopicSessionClientStatistic { + NActors::TActorId ReadActorId; + ui32 PartitionId = 0; + i64 UnreadRows = 0; + i64 UnreadBytes = 0; + ui64 Offset = 0; +}; + +struct TopicSessionCommonStatistic { + ui64 UnreadBytes = 0; +}; + +struct TopicSessionParams { + TString Endpoint; + TString Database; + TString TopicPath; + ui64 PartitionId = 0; +}; + +struct 
TopicSessionStatistic { + TopicSessionParams SessionKey; + TVector Clients; + TopicSessionCommonStatistic Common; +}; + +} // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp index d9bf3fd9dacb..afe01c867a05 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -48,16 +49,22 @@ struct TEvPrivate { enum EEv : ui32 { EvBegin = EventSpaceBegin(NActors::TEvents::ES_PRIVATE), EvCoordinatorPing = EvBegin + 20, - EvPrintState, + EvUpdateMetrics, EvEnd }; static_assert(EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE), "expect EvEnd < EventSpaceEnd(NActors::TEvents::ES_PRIVATE)"); struct TEvCoordinatorPing : NActors::TEventLocal {}; - struct TEvPrintState : public NActors::TEventLocal {}; + struct TEvUpdateMetrics : public NActors::TEventLocal {}; }; -ui64 PrintStatePeriodSec = 60; +struct TQueryStat { + const TString QueryId; + NYql::TCounters::TEntry UnreadRows; + NYql::TCounters::TEntry UnreadBytes; +}; + +ui64 UpdateMetricsPeriodSec = 60; class TRowDispatcher : public TActorBootstrapped { @@ -84,19 +91,19 @@ class TRowDispatcher : public TActorBootstrapped { struct TopicSessionKey { TString Endpoint; TString Database; - TString TopicName; + TString TopicPath; ui64 PartitionId; size_t Hash() const noexcept { ui64 hash = std::hash()(Endpoint); hash = CombineHashes(hash, std::hash()(Database)); - hash = CombineHashes(hash, std::hash()(TopicName)); + hash = CombineHashes(hash, std::hash()(TopicPath)); hash = CombineHashes(hash, std::hash()(PartitionId)); return hash; } bool operator==(const TopicSessionKey& other) const { return Endpoint == other.Endpoint && Database == other.Database - && TopicName == other.TopicName && PartitionId == other.PartitionId; + && TopicPath == other.TopicPath && PartitionId == other.PartitionId; } }; @@ -154,10 +161,12 @@ 
class TRowDispatcher : public TActorBootstrapped { TActorId TopicSessionId; const TString QueryId; ConsumerCounters Counters; + TopicSessionClientStatistic Stat; }; struct SessionInfo { TMap> Consumers; // key - ReadActor actor id + TopicSessionCommonStatistic Stat; }; struct TopicSessionInfo { @@ -198,15 +207,16 @@ class TRowDispatcher : public TActorBootstrapped { void Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev); void Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev); void Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev); + void Handle(NFq::TEvRowDispatcher::TEvSessionStatistic::TPtr& ev); void Handle(NActors::TEvents::TEvPing::TPtr& ev); void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvRetry::TPtr&); void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvPing::TPtr&); void Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr&); - void Handle(NFq::TEvPrivate::TEvPrintState::TPtr&); + void Handle(NFq::TEvPrivate::TEvUpdateMetrics::TPtr&); void DeleteConsumer(const ConsumerSessionKey& key); - void PrintInternalState(); + void UpdateMetrics(); STRICT_STFUNC( StateFunc, { @@ -223,12 +233,13 @@ class TRowDispatcher : public TActorBootstrapped { hFunc(NFq::TEvRowDispatcher::TEvStopSession, Handle); hFunc(NFq::TEvRowDispatcher::TEvSessionError, Handle); hFunc(NFq::TEvRowDispatcher::TEvStatus, Handle); + hFunc(NFq::TEvRowDispatcher::TEvSessionStatistic, Handle); hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvRetry, Handle); hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvPing, Handle); hFunc(NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed, Handle); hFunc(NActors::TEvents::TEvPing, Handle); hFunc(NFq::TEvRowDispatcher::TEvNewDataArrived, Handle); - hFunc(NFq::TEvPrivate::TEvPrintState, Handle); + hFunc(NFq::TEvPrivate::TEvUpdateMetrics, Handle); }) }; @@ -261,7 +272,7 @@ void TRowDispatcher::Bootstrap() { auto coordinatorId = Register(NewCoordinator(SelfId(), config, YqSharedResources, Tenant, Counters).release()); 
Register(NewLeaderElection(SelfId(), coordinatorId, config, CredentialsProviderFactory, YqSharedResources, Tenant, Counters).release()); Schedule(TDuration::Seconds(CoordinatorPingPeriodSec), new TEvPrivate::TEvCoordinatorPing()); - Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); + Schedule(TDuration::Seconds(UpdateMetricsPeriodSec), new NFq::TEvPrivate::TEvUpdateMetrics()); } void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChanged::TPtr& ev) { @@ -320,31 +331,42 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvCoordinatorChangesSubscrib Send(ev->Sender, new NFq::TEvRowDispatcher::TEvCoordinatorChanged(*CoordinatorActorId), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession); } -void TRowDispatcher::PrintInternalState() { +void TRowDispatcher::UpdateMetrics() { if (Consumers.empty()) { return; } + TMap queryStats; TStringStream str; - str << "Consumers:\n"; - for (auto& [key, consumerInfo] : Consumers) { - str << " query id " << consumerInfo->QueryId << ", partId: " << key.PartitionId << ", read actor id: " << key.ReadActorId - << ", queueId " << consumerInfo->EventQueueId << ", get next " << consumerInfo->Counters.GetNextBatch - << ", data arrived " << consumerInfo->Counters.NewDataArrived << ", message batch " << consumerInfo->Counters.MessageBatch << "\n"; - str << " "; - consumerInfo->EventsQueue.PrintInternalState(str); - } - str << "\nSessions:\n"; - for (auto& [key, sessionInfo1] : TopicSessions) { - str << " " << key.Endpoint << " / " << key.Database << " / " << key.TopicName << ", id: " << key.PartitionId << "\n"; - for (auto& [actorId, sessionInfo2] : sessionInfo1.Sessions) { - str << " session id: " << actorId << "\n"; - for (auto& [actorId2, consumer] : sessionInfo2.Consumers) { - str << " read actor id: " << actorId2 << "\n"; + str << "Statistics:\n"; + for (auto& [key, sessionsInfo] : TopicSessions) { + str << " " << key.Endpoint << " / " << key.Database << " / " << 
key.TopicPath << " / " << key.PartitionId; + for (auto& [actorId, sessionInfo] : sessionsInfo.Sessions) { + str << " / " << actorId << "\n"; + str << " unread bytes " << sessionInfo.Stat.UnreadBytes << "\n"; + for (auto& [readActorId, consumer] : sessionInfo.Consumers) { + auto& stat = queryStats[consumer->QueryId]; + stat.UnreadRows.Add(NYql::TCounters::TEntry(consumer->Stat.UnreadRows)); + stat.UnreadBytes.Add(NYql::TCounters::TEntry(consumer->Stat.UnreadBytes)); + str << " " << consumer->QueryId << " " << readActorId << " unread rows " + << consumer->Stat.UnreadRows << " unread bytes " << consumer->Stat.UnreadBytes << " offset " << consumer->Stat.Offset + << " get " << consumer->Counters.GetNextBatch + << " arrived " << consumer->Counters.NewDataArrived << " batch " << consumer->Counters.MessageBatch << " "; + str << " retry queue: "; + consumer->EventsQueue.PrintInternalState(str); } } } LOG_ROW_DISPATCHER_DEBUG(str.Str()); + + for (const auto& [queryId, stat] : queryStats) { + LOG_ROW_DISPATCHER_DEBUG("UnreadBytes " << queryId << " " << stat.UnreadBytes.Max); + auto queryGroup = Metrics.Counters->GetSubgroup("queryId", queryId); + queryGroup->GetCounter("MaxUnreadRows")->Set(stat.UnreadRows.Max); + queryGroup->GetCounter("AvgUnreadRows")->Set(stat.UnreadRows.Avg); + queryGroup->GetCounter("MaxUnreadBytes")->Set(stat.UnreadBytes.Max); + queryGroup->GetCounter("AvgUnreadBytes")->Set(stat.UnreadBytes.Avg); + } } void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { @@ -383,6 +405,8 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { LOG_ROW_DISPATCHER_DEBUG("Create new session " << readOffset); sessionActorId = ActorFactory->RegisterTopicSession( source.GetTopicPath(), + source.GetEndpoint(), + source.GetDatabase(), Config, SelfId(), ev->Get()->Record.GetPartitionId(), @@ -407,7 +431,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { Forward(ev, sessionActorId); 
Metrics.ClientsCount->Set(Consumers.size()); - PrintInternalState(); + UpdateMetrics(); } void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvGetNextBatch::TPtr& ev) { @@ -490,7 +514,7 @@ void TRowDispatcher::DeleteConsumer(const ConsumerSessionKey& key) { ConsumersByEventQueueId.erase(consumerIt->second->EventQueueId); Consumers.erase(consumerIt); Metrics.ClientsCount->Set(Consumers.size()); - PrintInternalState(); + UpdateMetrics(); } void TRowDispatcher::Handle(const NYql::NDq::TEvRetryQueuePrivate::TEvSessionClosed::TPtr& ev) { @@ -577,9 +601,37 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev) { it->second->EventsQueue.Send(ev.Release()->Release().Release()); } -void TRowDispatcher::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { - Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); - PrintInternalState(); +void TRowDispatcher::Handle(NFq::TEvPrivate::TEvUpdateMetrics::TPtr&) { + Schedule(TDuration::Seconds(UpdateMetricsPeriodSec), new NFq::TEvPrivate::TEvUpdateMetrics()); + UpdateMetrics(); +} + +void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvSessionStatistic::TPtr& ev) { + LOG_ROW_DISPATCHER_TRACE("TEvSessionStatistic from " << ev->Sender); + const auto& key = ev->Get()->Stat.SessionKey; + TopicSessionKey sessionKey{key.Endpoint, key.Database, key.TopicPath, key.PartitionId}; + + auto sessionsIt = TopicSessions.find(sessionKey); + if (sessionsIt == TopicSessions.end()) { + return; + } + auto& sessionsInfo = sessionsIt->second; + auto sessionIt = sessionsInfo.Sessions.find(ev->Sender); + if (sessionIt == sessionsInfo.Sessions.end()) { + return; + } + + auto& sessionInfo = sessionIt->second; + sessionInfo.Stat = ev->Get()->Stat.Common; + + for (const auto& clientStat : ev->Get()->Stat.Clients) { + auto it = sessionInfo.Consumers.find(clientStat.ReadActorId); + if (it == sessionInfo.Consumers.end()) { + continue; + } + auto consumerInfoPtr = it->second; + consumerInfoPtr->Stat = clientStat; 
+ } } } // namespace diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp index f87669d1f3ea..fb54e82540cd 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -54,7 +54,7 @@ struct TEvPrivate { EvStatus, EvDataAfterFilteration, EvDataFiltered, - EvPrintState, + EvSendStatistic, EvStartParsing, EvEnd }; @@ -63,7 +63,7 @@ struct TEvPrivate { // Events struct TEvPqEventsReady : public NActors::TEventLocal {}; struct TEvCreateSession : public NActors::TEventLocal {}; - struct TEvPrintState : public NActors::TEventLocal {}; + struct TEvSendStatistic : public NActors::TEventLocal {}; struct TEvStatus : public NActors::TEventLocal {}; struct TEvStartParsing : public NActors::TEventLocal {}; @@ -85,7 +85,7 @@ struct TEvPrivate { }; }; -ui64 PrintStatePeriodSec = 60; +ui64 SendStatisticPeriodSec = 5; ui64 MaxBatchSizeBytes = 10000000; ui64 MaxHandledEvents = 1000; @@ -110,7 +110,7 @@ class TTopicSession : public TActorBootstrapped { NActors::TActorId ReadActorId; std::unique_ptr Filter; // empty if no predicate TQueue> Buffer; - ui64 UsedSize = 0; + ui64 UnreadBytes = 0; bool DataArrivedSent = false; TMaybe NextMessageOffset; ui64 LastSendedNextMessageOffset = 0; @@ -137,6 +137,8 @@ class TTopicSession : public TActorBootstrapped { }; const TString TopicPath; + const TString Endpoint; + const TString Database; NActors::TActorId RowDispatcherActorId; ui32 PartitionId; NYdb::TDriver Driver; @@ -153,7 +155,7 @@ class TTopicSession : public TActorBootstrapped { THashSet ClientsWithoutPredicate; std::unique_ptr Parser; NConfig::TRowDispatcherConfig Config; - ui64 UsedSize = 0; + ui64 UnreadBytes = 0; const ::NMonitoring::TDynamicCounterPtr Counters; TTopicSessionMetrics Metrics; TParserSchema ParserSchema; @@ -163,6 +165,8 @@ class TTopicSession : public TActorBootstrapped { public: explicit TTopicSession( const TString& topicPath, + const TString& 
endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, @@ -203,13 +207,13 @@ class TTopicSession : public TActorBootstrapped { void Handle(NFq::TEvPrivate::TEvDataAfterFilteration::TPtr&); void Handle(NFq::TEvPrivate::TEvStatus::TPtr&); void Handle(NFq::TEvPrivate::TEvDataFiltered::TPtr&); - void Handle(NFq::TEvPrivate::TEvPrintState::TPtr&); + void Handle(NFq::TEvPrivate::TEvSendStatistic::TPtr&); void Handle(TEvRowDispatcher::TEvGetNextBatch::TPtr&); void Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev); void Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev); void HandleException(const std::exception& err); - void PrintInternalState(); + void SendStatistic(); void SendSessionError(NActors::TActorId readActorId, const TString& message); TVector> RebuildJson(const ClientsInfo& info, const TVector>& parsedValues); void UpdateParserSchema(const TParserInputType& inputType); @@ -223,7 +227,7 @@ class TTopicSession : public TActorBootstrapped { hFunc(NFq::TEvPrivate::TEvDataAfterFilteration, Handle); hFunc(NFq::TEvPrivate::TEvStatus, Handle); hFunc(NFq::TEvPrivate::TEvDataFiltered, Handle); - hFunc(NFq::TEvPrivate::TEvPrintState, Handle); + hFunc(NFq::TEvPrivate::TEvSendStatistic, Handle); hFunc(TEvRowDispatcher::TEvGetNextBatch, Handle); hFunc(NFq::TEvRowDispatcher::TEvStartSession, Handle); sFunc(NFq::TEvPrivate::TEvStartParsing, DoParsing); @@ -242,12 +246,14 @@ class TTopicSession : public TActorBootstrapped { IgnoreFunc(TEvRowDispatcher::TEvGetNextBatch); IgnoreFunc(NFq::TEvRowDispatcher::TEvStartSession); IgnoreFunc(NFq::TEvRowDispatcher::TEvStopSession); - IgnoreFunc(NFq::TEvPrivate::TEvPrintState); + IgnoreFunc(NFq::TEvPrivate::TEvSendStatistic); }) }; TTopicSession::TTopicSession( const TString& topicPath, + const TString& endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, @@ 
-256,6 +262,8 @@ TTopicSession::TTopicSession( const ::NMonitoring::TDynamicCounterPtr& counters, const NYql::IPqGateway::TPtr& pqGateway) : TopicPath(topicPath) + , Endpoint(endpoint) + , Database(database) , RowDispatcherActorId(rowDispatcherActorId) , PartitionId(partitionId) , Driver(std::move(driver)) @@ -276,7 +284,7 @@ void TTopicSession::Bootstrap() { << ", Timeout " << Config.GetTimeoutBeforeStartSessionSec() << " sec, StatusPeriod " << Config.GetSendStatusPeriodSec() << " sec"); Y_ENSURE(Config.GetSendStatusPeriodSec() > 0); Schedule(TDuration::Seconds(Config.GetSendStatusPeriodSec()), new NFq::TEvPrivate::TEvStatus()); - Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); + Schedule(TDuration::Seconds(SendStatisticPeriodSec), new NFq::TEvPrivate::TEvSendStatistic()); } void TTopicSession::PassAway() { @@ -290,8 +298,8 @@ void TTopicSession::SubscribeOnNextEvent() { return; } - if (Config.GetMaxSessionUsedMemory() && UsedSize > Config.GetMaxSessionUsedMemory()) { - LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UsedSize << " bytes), skip subscribing to WaitEvent()"); + if (Config.GetMaxSessionUsedMemory() && UnreadBytes > Config.GetMaxSessionUsedMemory()) { + LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UnreadBytes << " bytes), skip subscribing to WaitEvent()"); return; } @@ -306,8 +314,8 @@ void TTopicSession::SubscribeOnNextEvent() { NYdb::NTopic::TTopicClientSettings TTopicSession::GetTopicClientSettings(const NYql::NPq::NProto::TDqPqTopicSource& sourceParams) const { NYdb::NTopic::TTopicClientSettings opts; - opts.Database(sourceParams.GetDatabase()) - .DiscoveryEndpoint(sourceParams.GetEndpoint()) + opts.Database(Database) + .DiscoveryEndpoint(Endpoint) .SslCredentials(NYdb::TSslCredentials(sourceParams.GetUseSsl())) .CredentialsProviderFactory(CredentialsProviderFactory); return opts; @@ -448,8 +456,8 @@ void TTopicSession::HandleNewEvents() { if (!ReadSession) { return; } - if 
(Config.GetMaxSessionUsedMemory() && UsedSize > Config.GetMaxSessionUsedMemory()) { - LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UsedSize << " bytes), stop reading from yds"); + if (Config.GetMaxSessionUsedMemory() && UnreadBytes > Config.GetMaxSessionUsedMemory()) { + LOG_ROW_DISPATCHER_TRACE("Too much used memory (" << UnreadBytes << " bytes), stop reading from yds"); break; } TMaybe event = ReadSession->GetEvent(false); @@ -608,8 +616,8 @@ void TTopicSession::SendData(ClientsInfo& info) { ui64 batchSize = 0; while (!info.Buffer.empty()) { const auto& [offset, json] = info.Buffer.front(); - info.UsedSize -= json.size(); - UsedSize -= json.size(); + info.UnreadBytes -= json.size(); + UnreadBytes -= json.size(); batchSize += json.size(); NFq::NRowDispatcherProto::TEvMessage message; message.SetJson(json); @@ -696,7 +704,7 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStartSession::TPtr& ev) { FatalError("Adding new client failed, " + CurrentExceptionMessage()); } UpdateParser(); - PrintInternalState(); + SendStatistic(); if (!ReadSession) { Schedule(TDuration::Seconds(Config.GetTimeoutBeforeStartSessionSec()), new NFq::TEvPrivate::TEvCreateSession()); } @@ -708,8 +716,8 @@ void TTopicSession::AddDataToClient(ClientsInfo& info, ui64 offset, const TStrin } info.NextMessageOffset = offset + 1; info.Buffer.push(std::make_pair(offset, json)); - info.UsedSize += json.size(); - UsedSize += json.size(); + info.UnreadBytes += json.size(); + UnreadBytes += json.size(); SendDataArrived(info); } @@ -723,7 +731,7 @@ void TTopicSession::Handle(NFq::TEvRowDispatcher::TEvStopSession::TPtr& ev) { return; } auto& info = it->second; - UsedSize -= info.UsedSize; + UnreadBytes -= info.UnreadBytes; Clients.erase(it); ClientsWithoutPredicate.erase(ev->Sender); if (Clients.empty()) { @@ -848,21 +856,27 @@ void TTopicSession::HandleException(const std::exception& e) { FatalError(TString("Internal error: exception: ") + e.what()); } -void 
TTopicSession::PrintInternalState() { - TStringStream str; - str << "Clients:\n"; - str << "UsedSize: " << UsedSize << "\n"; +void TTopicSession::SendStatistic() { + TopicSessionStatistic stat; + stat.Common.UnreadBytes = UnreadBytes; + stat.SessionKey = TopicSessionParams{Endpoint, Database, TopicPath, PartitionId}; + stat.Clients.reserve(Clients.size()); for (auto& [readActorId, info] : Clients) { - str << " read actor id " << readActorId << ", buffer size " << info.Buffer.size() - << ", used size: " << info.UsedSize << ", data arrived sent " << info.DataArrivedSent - << ", next offset " << info.NextMessageOffset << "\n"; - } - LOG_ROW_DISPATCHER_DEBUG(str.Str()); + TopicSessionClientStatistic client; + client.PartitionId = PartitionId; + client.ReadActorId = readActorId; + client.UnreadRows = info.Buffer.size(); + client.UnreadBytes = info.UnreadBytes; + client.Offset = info.NextMessageOffset.GetOrElse(0); + stat.Clients.emplace_back(std::move(client)); + } + auto event = std::make_unique(stat); + Send(RowDispatcherActorId, event.release()); } -void TTopicSession::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { - Schedule(TDuration::Seconds(PrintStatePeriodSec), new NFq::TEvPrivate::TEvPrintState()); - PrintInternalState(); +void TTopicSession::Handle(NFq::TEvPrivate::TEvSendStatistic::TPtr&) { + Schedule(TDuration::Seconds(SendStatisticPeriodSec), new NFq::TEvPrivate::TEvSendStatistic()); + SendStatistic(); } } // namespace @@ -871,6 +885,8 @@ void TTopicSession::Handle(NFq::TEvPrivate::TEvPrintState::TPtr&) { std::unique_ptr NewTopicSession( const TString& topicPath, + const TString& endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, @@ -878,7 +894,7 @@ std::unique_ptr NewTopicSession( std::shared_ptr credentialsProviderFactory, const ::NMonitoring::TDynamicCounterPtr& counters, const NYql::IPqGateway::TPtr& pqGateway) { - return std::unique_ptr(new 
TTopicSession(topicPath, config, rowDispatcherActorId, partitionId, std::move(driver), credentialsProviderFactory, counters, pqGateway)); + return std::unique_ptr(new TTopicSession(topicPath, endpoint, database, config, rowDispatcherActorId, partitionId, std::move(driver), credentialsProviderFactory, counters, pqGateway)); } } // namespace NFq diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.h b/ydb/core/fq/libs/row_dispatcher/topic_session.h index 17ca62dda546..24a00be2c367 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.h +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.h @@ -17,6 +17,8 @@ namespace NFq { std::unique_ptr NewTopicSession( const TString& topicPath, + const TString& endpoint, + const TString& database, const NConfig::TRowDispatcherConfig& config, NActors::TActorId rowDispatcherActorId, ui32 partitionId, diff --git a/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp index 71eb34c58716..8186ee2e4384 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/row_dispatcher_ut.cpp @@ -28,6 +28,8 @@ struct TTestActorFactory : public NFq::NRowDispatcher::IActorFactory { NActors::TActorId RegisterTopicSession( const TString& /*topicPath*/, + const TString& /*endpoint*/, + const TString& /*database*/, const NConfig::TRowDispatcherConfig& /*config*/, NActors::TActorId /*rowDispatcherActorId*/, ui32 /*partitionId*/, diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp index d79edab956ae..75b4ab84e648 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -60,6 +60,8 @@ class TFixture : public NUnitTest::TBaseFixture { TopicSession = Runtime.Register(NewTopicSession( topicPath, + GetDefaultPqEndpoint(), + GetDefaultPqDatabase(), Config, RowDispatcherActorId, 0, diff 
--git a/ydb/library/yql/dq/actors/common/retry_queue.cpp b/ydb/library/yql/dq/actors/common/retry_queue.cpp index 1209f954d5e5..ba19940e8ed4 100644 --- a/ydb/library/yql/dq/actors/common/retry_queue.cpp +++ b/ydb/library/yql/dq/actors/common/retry_queue.cpp @@ -170,8 +170,8 @@ TDuration TRetryEventsQueue::TRetryState::RandomizeDelay(TDuration baseDelay) { } void TRetryEventsQueue::PrintInternalState(TStringStream& stream) const { - stream << "RetryQueue: id " << EventQueueId << ", NextSeqNo " - << NextSeqNo << ", MyConfirmedSeqNo " << MyConfirmedSeqNo << ", SeqNos " << ReceivedEventsSeqNos.size() << ", events size " << Events.size() << "\n"; + stream << "id " << EventQueueId << ", NextSeqNo " + << NextSeqNo << ", MyConfSeqNo " << MyConfirmedSeqNo << ", SeqNos " << ReceivedEventsSeqNos.size() << ", events size " << Events.size() << "\n"; } diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp index 91ec056ff986..6f4e77507197 100644 --- a/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp +++ b/ydb/library/yql/providers/pq/async_io/dq_pq_rd_read_actor.cpp @@ -633,6 +633,7 @@ void TDqPqRdReadActor::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) SRC_LOG_W("Ignore TEvMessageBatch from " << ev->Sender << ", seqNo " << meta.GetSeqNo() << ", ConfirmedSeqNo " << meta.GetConfirmedSeqNo() << ", PartitionId " << partitionId); YQL_ENSURE(State != EState::STARTED); + return; } Metrics.InFlyGetNextBatch->Set(0); From 40620ffbba9d66081d6b2debf421efd16fbdcd8b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 24 Oct 2024 10:01:39 +0300 Subject: [PATCH 51/56] YQ-3786 heap use after free / to stable (#10796) Co-authored-by: Alek5andr-Kotov --- ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp 
index be8c7e128e22..6030ee86956f 100644 --- a/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp +++ b/ydb/tests/fq/pq_async_io/ut/dq_pq_rd_read_actor_ut.cpp @@ -68,7 +68,7 @@ struct TFixture : public TPqIoTestFixture { auto ExpectCoordinatorRequest(NActors::TActorId coordinatorId) { auto eventHolder = CaSetup->Runtime->GrabEdgeEvent(coordinatorId, TDuration::Seconds(5)); UNIT_ASSERT(eventHolder.Get() != nullptr); - return eventHolder.Get(); + return eventHolder; } void ExpectStartSession(ui64 expectedOffset, NActors::TActorId rowDispatcherId) { From 63903b04771c5d08d2d553b02a71737e3ddeff0b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 24 Oct 2024 12:36:11 +0300 Subject: [PATCH 52/56] YQ-3775 Shared reading: try to fix memory leak in topic session / to stable (#10817) --- ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp index afe01c867a05..fa1806c40a53 100644 --- a/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp +++ b/ydb/core/fq/libs/row_dispatcher/row_dispatcher.cpp @@ -558,7 +558,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvNewDataArrived::TPtr& ev) } LOG_ROW_DISPATCHER_TRACE("Forward TEvNewDataArrived to " << ev->Get()->ReadActorId); it->second->Counters.NewDataArrived++; - it->second->EventsQueue.Send(ev.Release()->Release().Release()); + it->second->EventsQueue.Send(ev->Release().Release()); } void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) { @@ -572,7 +572,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvMessageBatch::TPtr& ev) { Metrics.RowsSent->Add(ev->Get()->Record.MessagesSize()); LOG_ROW_DISPATCHER_TRACE("Forward TEvMessageBatch to " << ev->Get()->ReadActorId); it->second->Counters.MessageBatch++; - it->second->EventsQueue.Send(ev.Release()->Release().Release()); + 
it->second->EventsQueue.Send(ev->Release().Release()); } void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev) { @@ -585,7 +585,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvSessionError::TPtr& ev) { } Metrics.ErrorsCount->Inc(); LOG_ROW_DISPATCHER_TRACE("Forward TEvSessionError to " << ev->Get()->ReadActorId); - it->second->EventsQueue.Send(ev.Release()->Release().Release()); + it->second->EventsQueue.Send(ev->Release().Release()); DeleteConsumer(key); } @@ -598,7 +598,7 @@ void TRowDispatcher::Handle(NFq::TEvRowDispatcher::TEvStatus::TPtr& ev) { return; } LOG_ROW_DISPATCHER_TRACE("Forward TEvStatus to " << ev->Get()->ReadActorId); - it->second->EventsQueue.Send(ev.Release()->Release().Release()); + it->second->EventsQueue.Send(ev->Release().Release()); } void TRowDispatcher::Handle(NFq::TEvPrivate::TEvUpdateMetrics::TPtr&) { From 84a2fb2014d69810fabb5a21ab87afa3d813d527 Mon Sep 17 00:00:00 2001 From: Hor911 Date: Thu, 24 Oct 2024 15:44:16 +0300 Subject: [PATCH 53/56] Better plans + bad plans handling (#10820) --- ydb/core/fq/libs/compute/common/utils.cpp | 48 +++--- ydb/core/fq/libs/compute/common/utils.h | 4 +- ydb/core/fq/libs/compute/common/ya.make | 1 + .../libs/compute/ydb/status_tracker_actor.cpp | 3 + .../fq/libs/compute/ydb/stopper_actor.cpp | 1 + ydb/public/lib/ydb_cli/common/plan2svg.cpp | 146 +++++++++++++++++- ydb/public/lib/ydb_cli/common/plan2svg.h | 1 + ydb/tests/tools/kqprun/src/kqp_runner.cpp | 2 +- 8 files changed, 172 insertions(+), 34 deletions(-) diff --git a/ydb/core/fq/libs/compute/common/utils.cpp b/ydb/core/fq/libs/compute/common/utils.cpp index 40c06436fb4c..1eb26160912a 100644 --- a/ydb/core/fq/libs/compute/common/utils.cpp +++ b/ydb/core/fq/libs/compute/common/utils.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace NFq { @@ -431,7 +432,7 @@ void EnumeratePlans(NYson::TYsonWriter& writer, NJson::TJsonValue& value, ui32& } } -TString GetV1StatFromV2Plan(const TString& plan, double* 
cpuUsage, TString* timeline, ui64 maxTimelineSize) { +TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage, TString* timeline) { TStringStream out; NYson::TYsonWriter writer(&out); writer.OnBeginMap(); @@ -475,20 +476,7 @@ TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage, TString* time if (timeline) { TPlanVisualizer planViz; planViz.LoadPlans(plan); - *timeline = planViz.PrintSvgSafe(); - if (maxTimelineSize && timeline->size() > maxTimelineSize) { - TStringBuilder builder; - builder - << "" << Endl - << " There is nothing wrong with the request." << Endl - << " Unfortunately, image size " << timeline->size() << " is too large." << Endl - << " It exceeds limit of " << maxTimelineSize << " and was discarded" << Endl - << "" << Endl; - *timeline = builder; - } - // remove json "timeline" field after migration - writer.OnKeyedItem("timeline"); - writer.OnStringScalar(*timeline); + *timeline = planViz.PrintSvg(); } writer.OnEndMap(); return NJson2Yson::ConvertYson2Json(out.Str()); @@ -1164,7 +1152,7 @@ struct TNoneStatProcessor : IPlanStatProcessor { return plan; } - TString GetQueryStat(const TString&, double& cpuUsage, TString*, ui64) override { + TString GetQueryStat(const TString&, double& cpuUsage, TString*) override { cpuUsage = 0.0; return ""; } @@ -1197,8 +1185,8 @@ struct TPlanStatProcessor : IPlanStatProcessor { return plan; } - TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 maxtimelineSize) override { - return GetV1StatFromV2Plan(plan, &cpuUsage, timeline, maxtimelineSize); + TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline) override { + return GetV1StatFromV2Plan(plan, &cpuUsage, timeline); } TPublicStat GetPublicStat(const TString& stat) override { @@ -1229,8 +1217,8 @@ struct TProfileStatProcessor : TPlanStatProcessor { }; struct TProdStatProcessor : TFullStatProcessor { - TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 
maxtimelineSize) override { - return GetPrettyStatistics(GetV1StatFromV2Plan(plan, &cpuUsage, timeline, maxtimelineSize)); + TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline) override { + return GetPrettyStatistics(GetV1StatFromV2Plan(plan, &cpuUsage, timeline)); } }; @@ -1263,10 +1251,18 @@ Fq::Private::PingTaskRequest PingTaskRequestBuilder::Build( ) { Fq::Private::PingTaskRequest pingTaskRequest = Build(queryStats); + // Application-level issues, pass as is if (issues) { NYql::IssuesToMessage(issues, pingTaskRequest.mutable_issues()); } + // Builder own (internal) issues will be logged later, just warn the user + if (Issues) { + auto* issue = pingTaskRequest.add_issues(); + issue->set_message("There are minor issues with query statistics processing. You can supply query ID and ask support for the information."); + issue->set_severity(NYql::TSeverityIds::S_WARNING); + } + if (computeStatus) { pingTaskRequest.set_status(*computeStatus); } @@ -1318,7 +1314,13 @@ Fq::Private::PingTaskRequest PingTaskRequestBuilder::Build(const TString& queryP CpuUsage = 0.0; try { TString timeline; - auto stat = Processor->GetQueryStat(plan, CpuUsage, ShowQueryTimeline ? &timeline : nullptr, MaxQueryTimelineSize); + auto stat = Processor->GetQueryStat(plan, CpuUsage, ShowQueryTimeline ? 
&timeline : nullptr); + + if (MaxQueryTimelineSize && timeline.size() > MaxQueryTimelineSize) { + Issues.AddIssue(NYql::TIssue(TStringBuilder() << "Timeline size " << timeline.size() << " exceeds limit of " << MaxQueryTimelineSize)); + timeline = ""; + } + pingTaskRequest.set_statistics(stat); pingTaskRequest.set_dump_raw_statistics(true); if (timeline) { @@ -1329,8 +1331,8 @@ Fq::Private::PingTaskRequest PingTaskRequestBuilder::Build(const TString& queryP flatStat["ComputeTimeUs"] = computeTimeUs; SerializeStats(*pingTaskRequest.mutable_flat_stats(), flatStat); PublicStat = Processor->GetPublicStat(stat); - } catch(const NJson::TJsonException& ex) { - Issues.AddIssue(NYql::TIssue(TStringBuilder() << "Error stat conversion: " << ex.what())); + } catch (const std::exception& e) { + Issues.AddIssue(NYql::TIssue(TStringBuilder() << "Error stat processing: " << e.what())); } return pingTaskRequest; diff --git a/ydb/core/fq/libs/compute/common/utils.h b/ydb/core/fq/libs/compute/common/utils.h index efd11787838d..e0be663cc3c4 100644 --- a/ydb/core/fq/libs/compute/common/utils.h +++ b/ydb/core/fq/libs/compute/common/utils.h @@ -28,7 +28,7 @@ inline std::shared_ptr CreateNewTableClient(const TS tableSettings); } -TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage = nullptr, TString* timeline = nullptr, ui64 maxTimelineSize = 0); +TString GetV1StatFromV2Plan(const TString& plan, double* cpuUsage = nullptr, TString* timeline = nullptr); TString GetV1StatFromV2PlanV2(const TString& plan); TString GetPrettyStatistics(const TString& statistics); THashMap AggregateStats(TStringBuf plan); @@ -55,7 +55,7 @@ struct IPlanStatProcessor { virtual Ydb::Query::StatsMode GetStatsMode() = 0; virtual TString ConvertPlan(const TString& plan) = 0; virtual TString GetPlanVisualization(const TString& plan) = 0; - virtual TString GetQueryStat(const TString& plan, double& cpuUsage, TString* timeline, ui64 maxtimelineSize) = 0; + virtual TString GetQueryStat(const TString& plan, 
double& cpuUsage, TString* timeline) = 0; virtual TPublicStat GetPublicStat(const TString& stat) = 0; virtual THashMap GetFlatStat(TStringBuf plan) = 0; }; diff --git a/ydb/core/fq/libs/compute/common/ya.make b/ydb/core/fq/libs/compute/common/ya.make index efdb54097732..fa776cc7f6fe 100644 --- a/ydb/core/fq/libs/compute/common/ya.make +++ b/ydb/core/fq/libs/compute/common/ya.make @@ -14,6 +14,7 @@ PEERDIR( ydb/core/fq/libs/grpc ydb/core/fq/libs/shared_resources ydb/library/yql/public/issue + ydb/library/yql/public/issue/protos ydb/library/yql/providers/common/http_gateway ydb/library/yql/providers/dq/provider ydb/library/yql/providers/generic/connector/api/service/protos diff --git a/ydb/core/fq/libs/compute/ydb/status_tracker_actor.cpp b/ydb/core/fq/libs/compute/ydb/status_tracker_actor.cpp index 062cf855d83f..d960accd7eec 100644 --- a/ydb/core/fq/libs/compute/ydb/status_tracker_actor.cpp +++ b/ydb/core/fq/libs/compute/ydb/status_tracker_actor.cpp @@ -228,6 +228,7 @@ class TStatusTrackerActor : public TBaseComputeActor { Fq::Private::PingTaskRequest pingTaskRequest = Builder.Build(QueryStats, Issues); if (Builder.Issues) { LOG_W(Builder.Issues.ToOneLineString()); + GetStepCountersSubgroup()->GetCounter("StatIssues", true)->Inc(); } ReportPublicCounters(Builder.PublicStat); Send(Pinger, new TEvents::TEvForwardPingRequest(pingTaskRequest), 0, 1); @@ -248,6 +249,7 @@ class TStatusTrackerActor : public TBaseComputeActor { Fq::Private::PingTaskRequest pingTaskRequest = Builder.Build(QueryStats, Issues, std::nullopt, StatusCode); if (Builder.Issues) { LOG_W(Builder.Issues.ToOneLineString()); + GetStepCountersSubgroup()->GetCounter("StatIssues", true)->Inc(); } ReportPublicCounters(Builder.PublicStat); UpdateCpuQuota(Builder.CpuUsage); @@ -263,6 +265,7 @@ class TStatusTrackerActor : public TBaseComputeActor { Fq::Private::PingTaskRequest pingTaskRequest = Builder.Build(QueryStats, Issues, ComputeStatus, std::nullopt); if (Builder.Issues) { 
LOG_W(Builder.Issues.ToOneLineString()); + GetStepCountersSubgroup()->GetCounter("StatIssues", true)->Inc(); } ReportPublicCounters(Builder.PublicStat); UpdateCpuQuota(Builder.CpuUsage); diff --git a/ydb/core/fq/libs/compute/ydb/stopper_actor.cpp b/ydb/core/fq/libs/compute/ydb/stopper_actor.cpp index c876bcd4422d..bfb0427e7a2a 100644 --- a/ydb/core/fq/libs/compute/ydb/stopper_actor.cpp +++ b/ydb/core/fq/libs/compute/ydb/stopper_actor.cpp @@ -136,6 +136,7 @@ class TStopperActor : public TBaseComputeActor { Fq::Private::PingTaskRequest pingTaskRequest = Builder.Build(response.QueryStats, response.Issues, FederatedQuery::QueryMeta::ABORTING_BY_USER, statusCode); if (Builder.Issues) { LOG_W(Builder.Issues.ToOneLineString()); + GetStepCountersSubgroup()->GetCounter("StatIssues", true)->Inc(); } Send(Pinger, new TEvents::TEvForwardPingRequest(pingTaskRequest)); } diff --git a/ydb/public/lib/ydb_cli/common/plan2svg.cpp b/ydb/public/lib/ydb_cli/common/plan2svg.cpp index 23b52ba9dfe2..842c3ca8df12 100644 --- a/ydb/public/lib/ydb_cli/common/plan2svg.cpp +++ b/ydb/public/lib/ydb_cli/common/plan2svg.cpp @@ -451,7 +451,7 @@ void TPlan::LoadStage(std::shared_ptr stage, const NJson::TJsonValue& no TStringBuilder builder; - if (name == "Iterator" || name == "Member") { + if (name == "Iterator" || name == "Member" || name == "ToFlow") { builder << "Reference"; } else { builder << name; @@ -461,11 +461,25 @@ void TPlan::LoadStage(std::shared_ptr stage, const NJson::TJsonValue& no if (auto* limitNode = subNode.GetValueByPath("Limit")) { builder << ": " << limitNode->GetStringSafe(); } + } else if (name == "Sort") { + if (auto* sortByNode = subNode.GetValueByPath("SortBy")) { + auto sortBy = sortByNode->GetStringSafe(); + while (true) { + auto p = sortBy.find("row."); + if (p == sortBy.npos) { + break; + } + sortBy.erase(p, 4); + } + if (sortBy) { + builder << " by " << sortBy; + } + } } else if (name == "Filter") { if (auto* predicateNode = subNode.GetValueByPath("Predicate")) { auto 
filter = predicateNode->GetStringSafe(); prevFilter = filter; - while(true) { + while (true) { auto p = filter.find("item."); if (p == filter.npos) { break; @@ -482,14 +496,110 @@ void TPlan::LoadStage(std::shared_ptr stage, const NJson::TJsonValue& no } builder << ": " << filter; } - } else if (name == "TopSort") { + } else if (name == "Aggregate") { + if (auto* aggregationNode = subNode.GetValueByPath("Aggregation")) { + auto aggr = aggregationNode->GetStringSafe(); + if (aggr) { + if (aggr.StartsWith("{")) { + aggr.erase(aggr.begin()); + } + if (aggr.EndsWith("}")) { + aggr.erase(aggr.end() - 1); + } + while (true) { + auto p = aggr.find("_yql_agg_"); + if (p == aggr.npos) { + break; + } + auto l = 9; + auto p1 = aggr.begin() + p + l; + while (p1 != aggr.end() && *p1 >= '0' && *p1 <= '9') { + p1++; + l++; + } + auto yqlAgg = aggr.substr(p, l); + if (p1 != aggr.end() && *p1 == ':') { + p1++; + l++; + if (p1 != aggr.end() && *p1 == ' ') { + p1++; + l++; + } + } + aggr.erase(p, l); + + auto extraChars = 7; + p = aggr.find(",state." + yqlAgg); + if (p == aggr.npos) { + p = aggr.find("state." + yqlAgg + ","); + } + if (p == aggr.npos) { + p = aggr.find("state." 
+ yqlAgg); + extraChars = 6; + } + if (p != aggr.npos) { + aggr.erase(p, yqlAgg.size() + extraChars); + } + } + while (true) { + auto p = aggr.find("item."); + if (p == aggr.npos) { + break; + } + aggr.erase(p, 5); + } + builder << " " << aggr; + } + } + if (auto* groupByNode = subNode.GetValueByPath("GroupBy")) { + auto groupBy = groupByNode->GetStringSafe(); + while (true) { + auto p = groupBy.find("item."); + if (p == groupBy.npos) { + break; + } + groupBy.erase(p, 5); + } + if (groupBy) { + builder << ", Group By: " << groupBy; + } + } + } else if (name == "TableFullScan") { + if (auto* tableNode = subNode.GetValueByPath("Table")) { + auto table = tableNode->GetStringSafe(); + auto n = table.find_last_of('/'); + if (n != table.npos) { + table = table.substr(n + 1); + } + builder << " " << table; + } + builder << "("; + if (auto* readColumnsNode = subNode.GetValueByPath("ReadColumns")) { + bool firstColumn = true; + for (const auto& subNode : readColumnsNode->GetArray()) { + if (firstColumn) { + firstColumn = false; + } else { + builder << ", "; + } + builder << subNode.GetStringSafe(); + } + } + builder << ")"; + } else if (name == "TopSort" || name == "Top") { if (auto* limitNode = subNode.GetValueByPath("Limit")) { - builder << ", Limit: " << limitNode->GetStringSafe(); + auto limit = limitNode->GetStringSafe(); + if (limit) { + builder << ", Limit: " << limit; + } } if (auto* topSortByNode = subNode.GetValueByPath("TopSortBy")) { - builder << ", TopSortBy: " << topSortByNode->GetStringSafe(); + auto topSortBy = topSortByNode->GetStringSafe(); + if (topSortBy) { + builder << ", TopSortBy: " << topSortBy; + } } - } else if (name == "Iterator" || name == "Member") { + } else if (name == "Iterator" || name == "Member" || name == "ToFlow") { if (auto* referenceNode = subNode.GetValueByPath(name)) { auto referenceName = referenceNode->GetStringSafe(); references.insert(referenceName); @@ -640,6 +750,7 @@ void TPlan::LoadStage(std::shared_ptr stage, const 
NJson::TJsonValue& no } if (planNodeType == "Connection") { auto* keyColumnsNode = plan.GetValueByPath("KeyColumns"); + auto* sortColumnsNode = plan.GetValueByPath("SortColumns"); if (auto* subNode = plan.GetValueByPath("Plans")) { for (auto& plan : subNode->GetArray()) { TString nodeType; @@ -659,6 +770,11 @@ void TPlan::LoadStage(std::shared_ptr stage, const NJson::TJsonValue& no stage->Connections.back()->KeyColumns.push_back(keyColumn.GetStringSafe()); } } + if (sortColumnsNode) { + for (auto& sortColumn : sortColumnsNode->GetArray()) { + stage->Connections.back()->SortColumns.push_back(sortColumn.GetStringSafe()); + } + } if (auto* planNodeIdNode = plan.GetValueByPath("PlanNodeId")) { auto planNodeId = planNodeIdNode->GetStringRobust(); @@ -758,8 +874,9 @@ void TPlan::LoadSource(std::shared_ptr source, const NJson::TJsonValue& builder << " " << sourceTypeNode->GetStringSafe(); } if (auto* nameNode = subNode.GetValueByPath("Name")) { - builder << " " << nameNode->GetStringSafe() << "("; + builder << " " << nameNode->GetStringSafe(); } + builder << "("; if (auto* readColumnsNode = subNode.GetValueByPath("ReadColumns")) { bool firstColumn = true; for (const auto& subNode : readColumnsNode->GetArray()) { @@ -1038,8 +1155,9 @@ void TPlan::PrintSvg(ui64 maxTime, ui32& offsetY, TStringBuilder& background, TS if (!s->Info.empty()) { for (auto text : s->Info) { canvas + << "" << text << "" << "" << text << "" << Endl; + << "' y='" << y0 << "'>" << text << "" << "" << Endl; y0 += (INTERNAL_TEXT_HEIGHT + INTERNAL_GAP_Y); } } else { @@ -1295,6 +1413,18 @@ void TPlan::PrintSvg(ui64 maxTime, ui32& offsetY, TStringBuilder& background, TS canvas << k; } } + if (!c->SortColumns.empty()) { + canvas << " SortColumns: "; + bool first = true; + for (auto s : c->SortColumns) { + if (first) { + first = false; + } else { + canvas << ", "; + } + canvas << s; + } + } canvas << "" << Endl << " ") { - columnType = "Optional"; - } - - if (columnType.StartsWith("Optional")) { - str << 
"IF(" << columnName << " IS NOT NULL, Unwrap(CAST(" << columnName << " as " << columnType << ")), NULL)"; - } else { - str << "Unwrap(CAST(" << columnName << " as " << columnType << "))"; - } - str << " as " << columnName << ((i != columnNames.size() - 1) ? "," : ""); - } - str << " FROM Input;\n"; - str << "$filtered = SELECT * FROM $fields " << whereFilter << ";\n"; + str << "$filtered = SELECT * FROM Input " << whereFilter << ";\n"; str << "SELECT " << OffsetFieldName << ", Unwrap(Json::SerializeJson(Yson::From(RemoveMembers(TableRow(), [\"" << OffsetFieldName; str << "\"])))) as data FROM $filtered"; @@ -300,7 +303,7 @@ class TJsonFilter::TImpl { private: THolder> Program; - THolder&, const TVector>&>>> InputConsumer; + THolder&, const TVector&>>> InputConsumer; const TString Sql; }; @@ -315,7 +318,7 @@ TJsonFilter::TJsonFilter( TJsonFilter::~TJsonFilter() { } -void TJsonFilter::Push(const TVector& offsets, const TVector>& values) { +void TJsonFilter::Push(const TVector& offsets, const TVector& values) { Impl->Push(offsets, values); } diff --git a/ydb/core/fq/libs/row_dispatcher/json_filter.h b/ydb/core/fq/libs/row_dispatcher/json_filter.h index f3435763ce3e..c4435bd9bab7 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_filter.h +++ b/ydb/core/fq/libs/row_dispatcher/json_filter.h @@ -1,7 +1,6 @@ #pragma once -#include -#include +#include namespace NFq { @@ -18,7 +17,7 @@ class TJsonFilter { ~TJsonFilter(); - void Push(const TVector& offsets, const TVector>& values); + void Push(const TVector& offsets, const TVector& values); TString GetSql(); private: diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.cpp b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp index 14c807fc0226..56946ef49a38 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_parser.cpp +++ b/ydb/core/fq/libs/row_dispatcher/json_parser.cpp @@ -2,6 +2,13 @@ #include +#include +#include +#include +#include +#include +#include + #include #include @@ -25,7 +32,7 @@ struct TJsonParserBuffer { } void 
Reserve(size_t size, size_t numberValues) { - Values.reserve(2 * (size + simdjson::SIMDJSON_PADDING)); + Values.reserve(size + simdjson::SIMDJSON_PADDING); Offsets.reserve(numberValues); } @@ -45,18 +52,10 @@ struct TJsonParserBuffer { } } - std::string_view AddHolder(std::string_view value) { - Y_ENSURE(Values.size() + value.size() <= Values.capacity(), "Requested too large holders"); - const size_t startPos = Values.size(); - Values << value; - return std::string_view(Values).substr(startPos, value.length()); - } - std::pair Finish() { Y_ENSURE(!Finished, "Cannot finish buffer twice"); Finished = true; Values << TString(simdjson::SIMDJSON_PADDING, ' '); - Values.reserve(2 * Values.size()); return {Values.data(), Values.size()}; } @@ -73,6 +72,188 @@ struct TJsonParserBuffer { TStringBuilder Values = {}; }; +class TColumnParser { + using TParser = std::function; + +public: + const std::string Name; + const TString TypeYson; + const NKikimr::NMiniKQL::TType* TypeMkql; + const bool IsOptional = false; + size_t NumberValues = 0; + +public: + TColumnParser(const TString& name, const TString& typeYson, NKikimr::NMiniKQL::TProgramBuilder& programBuilder) + : Name(name) + , TypeYson(typeYson) + , TypeMkql(NYql::NCommon::ParseTypeFromYson(TStringBuf(typeYson), programBuilder, Cerr)) + , IsOptional(TypeMkql->IsOptional()) + , NumberValues(0) + { + try { + Parser = CreateParser(TypeMkql); + } catch (...) 
{ + throw yexception() << "Failed to create parser for column '" << Name << "' with type " << TypeYson << ", description: " << CurrentExceptionMessage(); + } + } + + void ParseJsonValue(simdjson::fallback::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) { + Parser(jsonValue, resultValue); + NumberValues++; + } + + void ValidateNumberValues(size_t expectedNumberValues, ui64 firstOffset) const { + if (Y_UNLIKELY(!IsOptional && NumberValues < expectedNumberValues)) { + throw yexception() << "Failed to parse json messages, found " << expectedNumberValues - NumberValues << " missing values from offset " << firstOffset << " in non optional column '" << Name << "' with type " << TypeYson; + } + } + +private: + TParser CreateParser(const NKikimr::NMiniKQL::TType* type, bool optional = false) const { + switch (type->GetKind()) { + case NKikimr::NMiniKQL::TTypeBase::EKind::Data: { + const auto* dataType = AS_TYPE(NKikimr::NMiniKQL::TDataType, type); + if (const auto dataSlot = dataType->GetDataSlot()) { + return GetJsonValueParser(*dataSlot, optional); + } + throw yexception() << "unsupported data type with id " << dataType->GetSchemeType(); + } + + case NKikimr::NMiniKQL::TTypeBase::EKind::Optional: { + return AddOptional(CreateParser(AS_TYPE(NKikimr::NMiniKQL::TOptionalType, type)->GetItemType(), true)); + } + + default: { + throw yexception() << "unsupported type kind " << type->GetKindAsStr(); + } + } + } + + static TParser AddOptional(TParser parser) { + return [parser](simdjson::fallback::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) { + parser(std::move(jsonValue), resultValue); + if (resultValue) { + resultValue = resultValue.MakeOptional(); + } + }; + } + + static TParser GetJsonValueParser(NYql::NUdf::EDataSlot dataSlot, bool optional) { + const auto& typeInfo = NYql::NUdf::GetDataTypeInfo(dataSlot); + return [dataSlot, optional, &typeInfo](simdjson::fallback::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) 
{ + switch (jsonValue.type()) { + case simdjson::fallback::ondemand::json_type::number: { + try { + switch (dataSlot) { + case NYql::NUdf::EDataSlot::Int8: + resultValue = ParseJsonNumber(jsonValue.get_int64().value()); + break; + case NYql::NUdf::EDataSlot::Int16: + resultValue = ParseJsonNumber(jsonValue.get_int64().value()); + break; + case NYql::NUdf::EDataSlot::Int32: + resultValue = ParseJsonNumber(jsonValue.get_int64().value()); + break; + case NYql::NUdf::EDataSlot::Int64: + resultValue = NYql::NUdf::TUnboxedValuePod(jsonValue.get_int64().value()); + break; + + case NYql::NUdf::EDataSlot::Uint8: + resultValue = ParseJsonNumber(jsonValue.get_uint64().value()); + break; + case NYql::NUdf::EDataSlot::Uint16: + resultValue = ParseJsonNumber(jsonValue.get_uint64().value()); + break; + case NYql::NUdf::EDataSlot::Uint32: + resultValue = ParseJsonNumber(jsonValue.get_uint64().value()); + break; + case NYql::NUdf::EDataSlot::Uint64: + resultValue = NYql::NUdf::TUnboxedValuePod(jsonValue.get_uint64().value()); + break; + + case NYql::NUdf::EDataSlot::Double: + resultValue = NYql::NUdf::TUnboxedValuePod(jsonValue.get_double().value()); + break; + case NYql::NUdf::EDataSlot::Float: + resultValue = NYql::NUdf::TUnboxedValuePod(static_cast(jsonValue.get_double().value())); + break; + + default: + throw yexception() << "number value is not expected for data type " << typeInfo.Name; + } + } catch (...) 
{ + throw yexception() << "failed to parse data type " << typeInfo.Name << " from json number (raw: '" << TruncateString(jsonValue.raw_json_token()) << "'), error: " << CurrentExceptionMessage(); + } + break; + } + + case simdjson::fallback::ondemand::json_type::string: { + const auto rawString = jsonValue.get_string().value(); + resultValue = NKikimr::NMiniKQL::ValueFromString(dataSlot, rawString); + if (Y_UNLIKELY(!resultValue)) { + throw yexception() << "failed to parse data type " << typeInfo.Name << " from json string: '" << TruncateString(rawString) << "'"; + } + LockObject(resultValue); + break; + } + + case simdjson::fallback::ondemand::json_type::array: + case simdjson::fallback::ondemand::json_type::object: { + const auto rawJson = jsonValue.raw_json().value(); + if (Y_UNLIKELY(dataSlot != NYql::NUdf::EDataSlot::Json)) { + throw yexception() << "found unexpected nested value (raw: '" << TruncateString(rawJson) << "'), expected data type " < + static NYql::NUdf::TUnboxedValuePod ParseJsonNumber(TJsonNumber number) { + if (number < std::numeric_limits::min() || std::numeric_limits::max() < number) { + throw yexception() << "number is out of range"; + } + return NYql::NUdf::TUnboxedValuePod(static_cast(number)); + } + + static void LockObject(NYql::NUdf::TUnboxedValue& value) { + const i32 numberRefs = value.LockRef(); + Y_ENSURE(numberRefs == -1 || numberRefs == 1); + } + + static TString TruncateString(std::string_view rawString, size_t maxSize = 1_KB) { + if (rawString.size() <= maxSize) { + return TString(rawString); + } + return TStringBuilder() << rawString.substr(0, maxSize) << " truncated..."; + } + +private: + TParser Parser; +}; + } // anonymous namespace namespace NFq { @@ -80,26 +261,24 @@ namespace NFq { //// TJsonParser class TJsonParser::TImpl { - struct TColumnDescription { - std::string Name; - TString Type; - }; - public: TImpl(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) - : 
BatchSize(batchSize) + : Alloc(__LOCATION__, NKikimr::TAlignedPagePoolCounters(), true, false) + , TypeEnv(Alloc) + , BatchSize(batchSize) , BatchCreationTimeout(batchCreationTimeout) , ParsedValues(columns.size()) { Y_ENSURE(columns.size() == types.size(), "Number of columns and types should by equal"); - LOG_ROW_DISPATCHER_INFO("Simdjson active implementation " << simdjson::get_active_implementation()->name()); - Columns.reserve(columns.size()); - for (size_t i = 0; i < columns.size(); i++) { - Columns.emplace_back(TColumnDescription{ - .Name = columns[i], - .Type = SkipOptional(types[i]) - }); + with_lock (Alloc) { + auto functonRegistry = NKikimr::NMiniKQL::CreateFunctionRegistry(&PrintBackTrace, NKikimr::NMiniKQL::CreateBuiltinRegistry(), false, {}); + NKikimr::NMiniKQL::TProgramBuilder programBuilder(TypeEnv, *functonRegistry); + + Columns.reserve(columns.size()); + for (size_t i = 0; i < columns.size(); i++) { + Columns.emplace_back(columns[i], types[i], programBuilder); + } } ColumnsIndex.reserve(columns.size()); @@ -108,6 +287,8 @@ class TJsonParser::TImpl { } Buffer.Reserve(BatchSize, 1); + + LOG_ROW_DISPATCHER_INFO("Simdjson active implementation " << simdjson::get_active_implementation()->name()); Parser.threaded = false; } @@ -138,107 +319,90 @@ class TJsonParser::TImpl { Buffer.AddMessages(messages); } - const TVector>& Parse() { + const TVector& Parse() { Y_ENSURE(Buffer.IsReady(), "Nothing to parse"); const auto [values, size] = Buffer.Finish(); LOG_ROW_DISPATCHER_TRACE("Parse values:\n" << values); - for (auto& parsedColumn : ParsedValues) { - parsedColumn.clear(); - parsedColumn.reserve(Buffer.NumberValues); - } - - size_t rowId = 0; - simdjson::ondemand::document_stream documents = Parser.iterate_many(values, size, simdjson::dom::DEFAULT_BATCH_SIZE); - for (auto document : documents) { - for (auto item : document.get_object()) { - const auto it = ColumnsIndex.find(item.escaped_key().value()); - if (it == ColumnsIndex.end()) { - continue; - } - - 
const auto& column = Columns[it->second]; - - std::string_view value; - if (item.value().is_null()) { - // TODO: support optional types and create UV - continue; - } else if (column.Type == "Json") { - value = item.value().raw_json().value(); - } else if (column.Type == "String" || column.Type == "Utf8") { - value = item.value().get_string().value(); - } else if (item.value().is_scalar()) { - // TODO: perform type validation and create UV - value = item.value().raw_json_token().value(); - } else { - throw yexception() << "Failed to parse json string, expected scalar type for column '" << it->first << "' with type " << column.Type << " but got nested json, please change column type to Json."; + with_lock (Alloc) { + ClearColumns(Buffer.NumberValues); + + size_t rowId = 0; + simdjson::ondemand::document_stream documents = Parser.iterate_many(values, size, simdjson::ondemand::DEFAULT_BATCH_SIZE); + for (auto document : documents) { + for (auto item : document.get_object()) { + const auto it = ColumnsIndex.find(item.escaped_key().value()); + if (it == ColumnsIndex.end()) { + continue; + } + + const size_t columnId = it->second; + auto& columnParser = Columns[columnId]; + try { + columnParser.ParseJsonValue(item.value(), ParsedValues[columnId][rowId]); + } catch (...) 
{ + throw yexception() << "Failed to parse json string at offset " << Buffer.Offsets[rowId] << ", got parsing error for column '" << columnParser.Name << "' with type " << columnParser.TypeYson << ", description: " << CurrentExceptionMessage(); + } } + rowId++; + } - auto& parsedColumn = ParsedValues[it->second]; - parsedColumn.resize(rowId); - parsedColumn.emplace_back(CreateHolderIfNeeded(values, size, value)); + const ui64 firstOffset = Buffer.Offsets.front(); + if (rowId != Buffer.NumberValues) { + throw yexception() << "Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << rowId; + } + for (const auto& columnDesc : Columns) { + columnDesc.ValidateNumberValues(rowId, firstOffset); } - rowId++; } - Y_ENSURE(rowId == Buffer.NumberValues, "Unexpected number of json documents"); - for (auto& parsedColumn : ParsedValues) { - parsedColumn.resize(Buffer.NumberValues); - } return ParsedValues; } TString GetDescription() const { TStringBuilder description = TStringBuilder() << "Columns: "; for (const auto& column : Columns) { - description << "'" << column.Name << "':" << column.Type << " "; + description << "'" << column.Name << "':" << column.TypeYson << " "; } description << "\nNumber values in buffer: " << Buffer.NumberValues << ", buffer size: " << Buffer.GetSize() << ", finished: " << Buffer.Finished; return description; } - TString GetDebugString(const TVector>& parsedValues) const { - TStringBuilder result; - for (size_t i = 0; i < Columns.size(); ++i) { - result << "Parsed column '" << Columns[i].Name << "': "; - for (const auto& value : parsedValues[i]) { - result << "'" << value << "' "; - } - result << "\n"; - } - return result; + ~TImpl() { + Alloc.Acquire(); + ClearColumns(0); } private: - std::string_view CreateHolderIfNeeded(const char* dataHolder, size_t size, std::string_view value) { - ptrdiff_t diff = value.data() - dataHolder; - if (0 <= diff && static_cast(diff) < size) { - 
return value; - } - return Buffer.AddHolder(value); - } + void ClearColumns(size_t newSize) { + const auto clearValue = [&allocState = Alloc.Ref()](NYql::NUdf::TUnboxedValue& value){ + value.UnlockRef(1); + value.Clear(); + }; - static TString SkipOptional(const TString& type) { - if (type.StartsWith("Optional")) { - TStringBuf optionalType = type; - Y_ENSURE(optionalType.SkipPrefix("Optional<"), "Unexpected type"); - Y_ENSURE(optionalType.ChopSuffix(">"), "Unexpected type"); - return TString(optionalType); + for (size_t i = 0; i < Columns.size(); ++i) { + Columns[i].NumberValues = 0; + + auto& parsedColumn = ParsedValues[i]; + std::for_each(parsedColumn.begin(), parsedColumn.end(), clearValue); + parsedColumn.resize(newSize); } - return type; } private: + NKikimr::NMiniKQL::TScopedAlloc Alloc; + NKikimr::NMiniKQL::TTypeEnvironment TypeEnv; + const ui64 BatchSize; const TDuration BatchCreationTimeout; - TVector Columns; + TVector Columns; absl::flat_hash_map ColumnsIndex; TJsonParserBuffer Buffer; simdjson::ondemand::parser Parser; - TVector> ParsedValues; + TVector>> ParsedValues; }; TJsonParser::TJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) @@ -268,7 +432,7 @@ const TVector& TJsonParser::GetOffsets() const { return Impl->GetOffsets(); } -const TVector>& TJsonParser::Parse() { +const TVector& TJsonParser::Parse() { return Impl->Parse(); } @@ -276,10 +440,6 @@ TString TJsonParser::GetDescription() const { return Impl->GetDescription(); } -TString TJsonParser::GetDebugString(const TVector>& parsedValues) const { - return Impl->GetDebugString(parsedValues); -} - std::unique_ptr NewJsonParser(const TVector& columns, const TVector& types, ui64 batchSize, TDuration batchCreationTimeout) { return std::unique_ptr(new TJsonParser(columns, types, batchSize, batchCreationTimeout)); } diff --git a/ydb/core/fq/libs/row_dispatcher/json_parser.h b/ydb/core/fq/libs/row_dispatcher/json_parser.h index 
4f5f2b14e3a2..878fa534f015 100644 --- a/ydb/core/fq/libs/row_dispatcher/json_parser.h +++ b/ydb/core/fq/libs/row_dispatcher/json_parser.h @@ -1,5 +1,7 @@ #pragma once +#include + #include namespace NFq { @@ -15,10 +17,9 @@ class TJsonParser { const TVector& GetOffsets() const; void AddMessages(const TVector& messages); - const TVector>& Parse(); + const TVector& Parse(); TString GetDescription() const; - TString GetDebugString(const TVector>& parsedValues) const; private: class TImpl; diff --git a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp index fb54e82540cd..557451c36aed 100644 --- a/ydb/core/fq/libs/row_dispatcher/topic_session.cpp +++ b/ydb/core/fq/libs/row_dispatcher/topic_session.cpp @@ -189,7 +189,7 @@ class TTopicSession : public TActorBootstrapped { void SubscribeOnNextEvent(); void SendToParsing(const TVector& messages); void DoParsing(bool force = false); - void DoFiltering(const TVector& offsets, const TVector>& parsedValues); + void DoFiltering(const TVector& offsets, const TVector& parsedValues); void SendData(ClientsInfo& info); void UpdateParser(); void FatalError(const TString& message, const std::unique_ptr* filter = nullptr); @@ -215,7 +215,7 @@ class TTopicSession : public TActorBootstrapped { void SendStatistic(); void SendSessionError(NActors::TActorId readActorId, const TString& message); - TVector> RebuildJson(const ClientsInfo& info, const TVector>& parsedValues); + TVector RebuildJson(const ClientsInfo& info, const TVector& parsedValues); void UpdateParserSchema(const TParserInputType& inputType); void UpdateFieldsIds(ClientsInfo& clientInfo); @@ -387,15 +387,15 @@ void TTopicSession::Handle(NFq::TEvPrivate::TEvCreateSession::TPtr&) { CreateTopicSession(); } -TVector> TTopicSession::RebuildJson(const ClientsInfo& info, const TVector>& parsedValues) { - TVector> result; +TVector TTopicSession::RebuildJson(const ClientsInfo& info, const TVector& parsedValues) { + TVector result; const 
auto& offsets = ParserSchema.FieldsMap; result.reserve(info.FieldsIds.size()); for (auto fieldId : info.FieldsIds) { Y_ENSURE(fieldId < offsets.size(), "fieldId " << fieldId << ", offsets.size() " << offsets.size()); auto offset = offsets[fieldId]; Y_ENSURE(offset < parsedValues.size(), "offset " << offset << ", jsonBatch.size() " << parsedValues.size()); - result.push_back(parsedValues[offset]); + result.push_back(&parsedValues[offset]); } return result; } @@ -584,9 +584,9 @@ void TTopicSession::DoParsing(bool force) { } } -void TTopicSession::DoFiltering(const TVector& offsets, const TVector>& parsedValues) { +void TTopicSession::DoFiltering(const TVector& offsets, const TVector& parsedValues) { Y_ENSURE(parsedValues, "Expected non empty schema"); - LOG_ROW_DISPATCHER_TRACE("SendToFiltering, first offset: " << offsets.front() << ", last offset: " << offsets.back() << ", data:\n" << Parser->GetDebugString(parsedValues)); + LOG_ROW_DISPATCHER_TRACE("SendToFiltering, first offset: " << offsets.front() << ", last offset: " << offsets.back()); for (auto& [actorId, info] : Clients) { try { diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp index 1e2befea3778..fecc26e4a73c 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_filter_ut.cpp @@ -8,6 +8,9 @@ #include #include #include + +#include + #include namespace { @@ -19,7 +22,9 @@ class TFixture : public NUnitTest::TBaseFixture { public: TFixture() - : Runtime(true) {} + : Runtime(true) + , Alloc(__LOCATION__, NKikimr::TAlignedPagePoolCounters(), true, false) + {} void SetUp(NUnitTest::TTestContext&) override { TAutoPtr app = new TAppPrepare(); @@ -30,6 +35,14 @@ class TFixture : public NUnitTest::TBaseFixture { } void TearDown(NUnitTest::TTestContext& /* context */) override { + with_lock (Alloc) { + for (const auto& holder : Holders) { + for (const auto& value : holder) { + 
Alloc.Ref().UnlockObject(value); + } + } + Holders.clear(); + } Filter.reset(); } @@ -45,9 +58,44 @@ class TFixture : public NUnitTest::TBaseFixture { callback); } + const NKikimr::NMiniKQL::TUnboxedValueVector* MakeVector(size_t size, std::function valueCreator) { + with_lock (Alloc) { + Holders.emplace_front(); + for (size_t i = 0; i < size; ++i) { + Holders.front().emplace_back(valueCreator(i)); + Alloc.Ref().LockObject(Holders.front().back()); + } + return &Holders.front(); + } + } + + template + const NKikimr::NMiniKQL::TUnboxedValueVector* MakeVector(const TVector& values, bool optional = false) { + return MakeVector(values.size(), [&](size_t i) { + NYql::NUdf::TUnboxedValuePod unboxedValue = NYql::NUdf::TUnboxedValuePod(values[i]); + return optional ? unboxedValue.MakeOptional() : unboxedValue; + }); + } + + const NKikimr::NMiniKQL::TUnboxedValueVector* MakeStringVector(const TVector& values, bool optional = false) { + return MakeVector(values.size(), [&](size_t i) { + NYql::NUdf::TUnboxedValuePod stringValue = NKikimr::NMiniKQL::MakeString(values[i]); + return optional ? 
stringValue.MakeOptional() : stringValue; + }); + } + + const NKikimr::NMiniKQL::TUnboxedValueVector* MakeEmptyVector(size_t size) { + return MakeVector(size, [&](size_t) { + return NYql::NUdf::TUnboxedValuePod(); + }); + } + TActorSystemStub actorSystemStub; NActors::TTestActorRuntime Runtime; std::unique_ptr Filter; + + NKikimr::NMiniKQL::TScopedAlloc Alloc; + TList Holders; }; Y_UNIT_TEST_SUITE(TJsonFilterTests) { @@ -55,13 +103,13 @@ Y_UNIT_TEST_SUITE(TJsonFilterTests) { TMap result; MakeFilter( {"a1", "a2", "a@3"}, - {"String", "UInt64", "Optional"}, + {"[DataType; String]", "[DataType; Uint64]", "[OptionalType; [DataType; String]]"}, "where a2 > 100", [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push({5}, {{"hello1"}, {"99"}, {"zapuskaem"}}); - Filter->Push({6}, {{"hello2"}, {"101"}, {"gusya"}}); + Filter->Push({5}, {MakeStringVector({"hello1"}), MakeVector({99}), MakeStringVector({"zapuskaem"}, true)}); + Filter->Push({6}, {MakeStringVector({"hello2"}), MakeVector({101}), MakeStringVector({"gusya"}, true)}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101,"a@3":"gusya"})", result[6]); } @@ -70,55 +118,46 @@ Y_UNIT_TEST_SUITE(TJsonFilterTests) { TMap result; MakeFilter( {"a2", "a1"}, - {"UInt64", "String"}, + {"[DataType; Uint64]", "[DataType; String]"}, "where a2 > 100", [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push({5}, {{"99"}, {"hello1"}}); - Filter->Push({6}, {{"101"}, {"hello2"}}); + Filter->Push({5}, {MakeVector({99}), MakeStringVector({"hello1"})}); + Filter->Push({6}, {MakeVector({101}), MakeStringVector({"hello2"})}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); - UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({7}, {{"102"}, {std::string_view()}}), yexception, "Failed to unwrap empty optional"); - UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({8}, {{"str"}, {"hello3"}}), 
yexception, "Failed to unwrap empty optional"); } Y_UNIT_TEST_F(ManyValues, TFixture) { TMap result; MakeFilter( - {"a1", "a2"}, - {"String", "UInt64"}, + {"a1", "a2", "a3"}, + {"[DataType; String]", "[DataType; Uint64]", "[DataType; String]"}, "where a2 > 100", [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push({5, 6}, {{"hello1", "hello2"}, {"99", "101"}}); - UNIT_ASSERT_VALUES_EQUAL(1, result.size()); - UNIT_ASSERT_VALUES_EQUAL(R"({"a1":"hello2","a2":101})", result[6]); + const TString largeString = "abcdefghjkl1234567890+abcdefghjkl1234567890"; + for (ui64 i = 0; i < 5; ++i) { + Filter->Push({2 * i, 2 * i + 1}, {MakeStringVector({"hello1", "hello2"}), MakeVector({99, 101}), MakeStringVector({largeString, largeString})}); + UNIT_ASSERT_VALUES_EQUAL_C(i + 1, result.size(), i); + UNIT_ASSERT_VALUES_EQUAL_C(TStringBuilder() << "{\"a1\":\"hello2\",\"a2\":101,\"a3\":\"" << largeString << "\"}", result[2 * i + 1], i); + } } Y_UNIT_TEST_F(NullValues, TFixture) { TMap result; MakeFilter( {"a1", "a2"}, - {"Optional", "String"}, + {"[OptionalType; [DataType; Uint64]]", "[DataType; String]"}, "where a1 is null", [&](ui64 offset, const TString& json) { result[offset] = json; }); - Filter->Push({5}, {{std::string_view()}, {"str"}}); + Filter->Push({5}, {MakeEmptyVector(1), MakeStringVector({"str"})}); UNIT_ASSERT_VALUES_EQUAL(1, result.size()); UNIT_ASSERT_VALUES_EQUAL(R"({"a1":null,"a2":"str"})", result[5]); - UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({5}, {{"hello1"}, {"str"}}), yexception, "Failed to unwrap empty optional"); - } - - Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { - MakeFilter( - {"a1", "a2"}, - {"String", "UInt64"}, - "where Unwrap(a2) = 1", - [&](ui64, const TString&) { }); - UNIT_ASSERT_EXCEPTION_CONTAINS(Filter->Push({5}, {{"99"}, {"hello1"}}), yexception, "Failed to unwrap empty optional"); } } diff --git a/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp 
index 28242a1ebc74..bdf74270094d 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/json_parser_ut.cpp @@ -40,23 +40,15 @@ class TFixture : public NUnitTest::TBaseFixture { } void MakeParser(TVector columns) { - MakeParser(columns, TVector(columns.size(), "String")); + MakeParser(columns, TVector(columns.size(), "[DataType; String]")); } - void PushToParser(ui64 offset, const TString& data) { + const TVector& PushToParser(ui64 offset, const TString& data) { Parser->AddMessages({GetMessage(offset, data)}); - ParsedValues = Parser->Parse(); - ResultNumberValues = ParsedValues ? ParsedValues.front().size() : 0; - } - - TVector GetParsedRow(size_t id) const { - TVector result; - result.reserve(ParsedValues.size()); - for (const auto& columnResult : ParsedValues) { - result.emplace_back(columnResult[id]); - } - return result; + const auto& parsedValues = Parser->Parse(); + ResultNumberValues = parsedValues ? parsedValues.front().size() : 0; + return parsedValues; } static NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage GetMessage(ui64 offset, const TString& data) { @@ -67,54 +59,67 @@ class TFixture : public NUnitTest::TBaseFixture { TActorSystemStub actorSystemStub; NActors::TTestActorRuntime Runtime; std::unique_ptr Parser; - ui64 ResultNumberValues = 0; - TVector> ParsedValues; }; Y_UNIT_TEST_SUITE(TJsonParserTests) { Y_UNIT_TEST_F(Simple1, TFixture) { - MakeParser({"a1", "a2"}, {"String", "Optional"}); - PushToParser(42,R"({"a1": "hello1", "a2": 101, "event": "event1"})"); + MakeParser({"a1", "a2"}, {"[DataType; String]", "[OptionalType; [DataType; Uint64]]"}); + const auto& result = PushToParser(42,R"({"a1": "hello1", "a2": 101, "event": "event1"})"); UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); - const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); - UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); - UNIT_ASSERT_VALUES_EQUAL("101", result.back()); + 
UNIT_ASSERT_VALUES_EQUAL("hello1", TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL(101, result[1][0].GetOptionalValue().Get()); } Y_UNIT_TEST_F(Simple2, TFixture) { MakeParser({"a2", "a1"}); - PushToParser(42,R"({"a1": "hello1", "a2": "101", "event": "event1"})"); + const auto& result = PushToParser(42,R"({"a1": "hello1", "a2": "101", "event": "event1"})"); UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); - const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); - UNIT_ASSERT_VALUES_EQUAL("101", result.front()); - UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); + UNIT_ASSERT_VALUES_EQUAL("101", TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL("hello1", TString(result[1][0].AsStringRef())); } Y_UNIT_TEST_F(Simple3, TFixture) { MakeParser({"a1", "a2"}); - PushToParser(42,R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + const auto& result = PushToParser(42,R"({"a2": "hello1", "a1": "101", "event": "event1"})"); UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); - const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); - UNIT_ASSERT_VALUES_EQUAL("101", result.front()); - UNIT_ASSERT_VALUES_EQUAL("hello1", result.back()); + UNIT_ASSERT_VALUES_EQUAL("101", TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL("hello1", TString(result[1][0].AsStringRef())); } Y_UNIT_TEST_F(Simple4, TFixture) { MakeParser({"a2", "a1"}); - PushToParser(42, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); + const auto& result = PushToParser(42, R"({"a2": "hello1", "a1": "101", "event": "event1"})"); UNIT_ASSERT_VALUES_EQUAL(1, ResultNumberValues); - const auto& result = GetParsedRow(0); UNIT_ASSERT_VALUES_EQUAL(2, result.size()); - UNIT_ASSERT_VALUES_EQUAL("hello1", result.front()); - UNIT_ASSERT_VALUES_EQUAL("101", result.back()); + UNIT_ASSERT_VALUES_EQUAL("hello1", TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL("101", TString(result[1][0].AsStringRef())); + } 
+ + Y_UNIT_TEST_F(LargeStrings, TFixture) { + MakeParser({"col"}); + + const TString largeString = "abcdefghjkl1234567890+abcdefghjkl1234567890"; + const TString jsonString = TStringBuilder() << "{\"col\": \"" << largeString << "\"}"; + Parser->AddMessages({ + GetMessage(42, jsonString), + GetMessage(43, jsonString) + }); + + const auto& result = Parser->Parse(); + ResultNumberValues = result.front().size(); + UNIT_ASSERT_VALUES_EQUAL(2, ResultNumberValues); + + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(largeString, TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL(largeString, TString(result[0][1].AsStringRef())); } Y_UNIT_TEST_F(ManyValues, TFixture) { @@ -126,73 +131,116 @@ Y_UNIT_TEST_SUITE(TJsonParserTests) { GetMessage(44, R"({"a2": "101", "a1": "hello1", "event": "event3"})") }); - ParsedValues = Parser->Parse(); - ResultNumberValues = ParsedValues.front().size(); + const auto& result = Parser->Parse(); + ResultNumberValues = result.front().size(); UNIT_ASSERT_VALUES_EQUAL(3, ResultNumberValues); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); for (size_t i = 0; i < ResultNumberValues; ++i) { - const auto& result = GetParsedRow(i); - UNIT_ASSERT_VALUES_EQUAL_C(2, result.size(), i); - UNIT_ASSERT_VALUES_EQUAL_C("hello1", result.front(), i); - UNIT_ASSERT_VALUES_EQUAL_C("101", result.back(), i); + UNIT_ASSERT_VALUES_EQUAL_C("hello1", TString(result[0][i].AsStringRef()), i); + UNIT_ASSERT_VALUES_EQUAL_C("101", TString(result[1][i].AsStringRef()), i); } } Y_UNIT_TEST_F(MissingFields, TFixture) { - MakeParser({"a1", "a2"}); + MakeParser({"a1", "a2"}, {"[OptionalType; [DataType; String]]", "[OptionalType; [DataType; Uint64]]"}); Parser->AddMessages({ - GetMessage(42, R"({"a1": "hello1", "a2": "101", "event": "event1"})"), + GetMessage(42, R"({"a1": "hello1", "a2": 101 , "event": "event1"})"), GetMessage(43, R"({"a1": "hello1", "event": "event2"})"), GetMessage(44, R"({"a2": "101", "a1": null, "event": "event3"})") }); - 
ParsedValues = Parser->Parse(); - ResultNumberValues = ParsedValues.front().size(); + const auto& result = Parser->Parse(); + ResultNumberValues = result.front().size(); UNIT_ASSERT_VALUES_EQUAL(3, ResultNumberValues); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); for (size_t i = 0; i < ResultNumberValues; ++i) { - const auto& result = GetParsedRow(i); - UNIT_ASSERT_VALUES_EQUAL_C(2, result.size(), i); - UNIT_ASSERT_VALUES_EQUAL_C(i != 2 ? "hello1" : "", result.front(), i); - UNIT_ASSERT_VALUES_EQUAL_C(i != 1 ? "101" : "", result.back(), i); + if (i == 2) { + UNIT_ASSERT_C(!result[0][i], i); + } else { + NYql::NUdf::TUnboxedValue value = result[0][i].GetOptionalValue(); + UNIT_ASSERT_VALUES_EQUAL_C("hello1", TString(value.AsStringRef()), i); + } + if (i == 1) { + UNIT_ASSERT_C(!result[1][i], i); + } else { + UNIT_ASSERT_VALUES_EQUAL_C(101, result[1][i].GetOptionalValue().Get(), i); + } } } Y_UNIT_TEST_F(NestedTypes, TFixture) { - MakeParser({"nested", "a1"}, {"Optional", "String"}); + MakeParser({"nested", "a1"}, {"[OptionalType; [DataType; Json]]", "[DataType; String]"}); Parser->AddMessages({ GetMessage(42, R"({"a1": "hello1", "nested": {"key": "value"}})"), - GetMessage(43, R"({"a1": "hello1", "nested": ["key1", "key2"]})") + GetMessage(43, R"({"a1": "hello2", "nested": ["key1", "key2"]})") }); - ParsedValues = Parser->Parse(); - ResultNumberValues = ParsedValues.front().size(); + const auto& result = Parser->Parse(); + ResultNumberValues = result.front().size(); UNIT_ASSERT_VALUES_EQUAL(2, ResultNumberValues); - const auto& nestedJson = GetParsedRow(0); - UNIT_ASSERT_VALUES_EQUAL(2, nestedJson.size()); - UNIT_ASSERT_VALUES_EQUAL("{\"key\": \"value\"}", nestedJson.front()); - UNIT_ASSERT_VALUES_EQUAL("hello1", nestedJson.back()); + UNIT_ASSERT_VALUES_EQUAL(2, result.size()); + UNIT_ASSERT_VALUES_EQUAL("{\"key\": \"value\"}", TString(result[0][0].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL("hello1", TString(result[1][0].AsStringRef())); + + 
UNIT_ASSERT_VALUES_EQUAL("[\"key1\", \"key2\"]", TString(result[0][1].AsStringRef())); + UNIT_ASSERT_VALUES_EQUAL("hello2", TString(result[1][1].AsStringRef())); + } + + Y_UNIT_TEST_F(SimpleBooleans, TFixture) { + MakeParser({"a"}, {"[DataType; Bool]"}); + Parser->AddMessages({ + GetMessage(42, R"({"a": true})"), + GetMessage(43, R"({"a": false})") + }); + + const auto& result = Parser->Parse(); + ResultNumberValues = result.front().size(); + UNIT_ASSERT_VALUES_EQUAL(2, ResultNumberValues); - const auto& nestedList = GetParsedRow(1); - UNIT_ASSERT_VALUES_EQUAL(2, nestedList.size()); - UNIT_ASSERT_VALUES_EQUAL("[\"key1\", \"key2\"]", nestedList.front()); - UNIT_ASSERT_VALUES_EQUAL("hello1", nestedList.back()); + UNIT_ASSERT_VALUES_EQUAL(1, result.size()); + UNIT_ASSERT_VALUES_EQUAL(true, result[0][0].Get()); + UNIT_ASSERT_VALUES_EQUAL(false, result[0][1].Get()); } - Y_UNIT_TEST_F(StringTypeValidation, TFixture) { - MakeParser({"a1"}, {"String"}); - UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": 1234})"), simdjson::simdjson_error, "INCORRECT_TYPE: The JSON element does not have the requested type."); + Y_UNIT_TEST_F(MissingFieldsValidation, TFixture) { + MakeParser({"a1", "a2"}, {"[DataType; String]", "[DataType; Uint64]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": "hello1", "a2": null, "event": "event1"})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a2' with type [DataType; Uint64], description: (yexception) found unexpected null value, expected non optional data type Uint64"); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a2": 105, "event": "event1"})"), yexception, "Failed to parse json messages, found 1 missing values from offset 42 in non optional column 'a1' with type [DataType; String]"); } - Y_UNIT_TEST_F(JsonTypeValidation, TFixture) { - MakeParser({"a1"}, {"Int32"}); - UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": {"key": "value"}})"), yexception, "Failed to parse 
json string, expected scalar type for column 'a1' with type Int32 but got nested json, please change column type to Json."); + Y_UNIT_TEST_F(TypeKindsValidation, TFixture) { + UNIT_ASSERT_EXCEPTION_CONTAINS( + MakeParser({"a2", "a1"}, {"[OptionalType; [DataType; String]]", "[ListType; [DataType; String]]"}), + yexception, + "Failed to create parser for column 'a1' with type [ListType; [DataType; String]], description: (yexception) unsupported type kind List" + ); + } + + Y_UNIT_TEST_F(NumbersValidation, TFixture) { + MakeParser({"a1", "a2"}, {"[OptionalType; [DataType; String]]", "[DataType; Uint8]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": 456, "a2": 42})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a1' with type [OptionalType; [DataType; String]], description: (yexception) failed to parse data type String from json number (raw: '456'), error: (yexception) number value is not expected for data type String"); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": "456", "a2": -42})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a2' with type [DataType; Uint8], description: (yexception) failed to parse data type Uint8 from json number (raw: '-42'), error: (simdjson::simdjson_error) INCORRECT_TYPE: The JSON element does not have the requested type."); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": "str", "a2": 99999})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a2' with type [DataType; Uint8], description: (yexception) failed to parse data type Uint8 from json number (raw: '99999'), error: (yexception) number is out of range"); + } + + Y_UNIT_TEST_F(NestedJsonValidation, TFixture) { + MakeParser({"a1", "a2"}, {"[OptionalType; [DataType; Json]]", "[OptionalType; [DataType; String]]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": {"key": "value"}, "a2": {"key2": "value2"}})"), yexception, 
"Failed to parse json string at offset 42, got parsing error for column 'a2' with type [OptionalType; [DataType; String]], description: (yexception) found unexpected nested value (raw: '{\"key2\": \"value2\"}'), expected data type String, please use Json type for nested values"); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": {"key" "value"}, "a2": "str"})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a1' with type [OptionalType; [DataType; Json]], description: (simdjson::simdjson_error) TAPE_ERROR: The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc."); + } + + Y_UNIT_TEST_F(BoolsValidation, TFixture) { + MakeParser({"a1", "a2"}, {"[OptionalType; [DataType; String]]", "[DataType; Bool]"}); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a1": true, "a2": false})"), yexception, "Failed to parse json string at offset 42, got parsing error for column 'a1' with type [OptionalType; [DataType; String]], description: (yexception) found unexpected bool value, expected data type String"); } Y_UNIT_TEST_F(ThrowExceptionByError, TFixture) { - MakeParser({"a2", "a1"}); + MakeParser({"a"}); UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"(ydb)"), simdjson::simdjson_error, "INCORRECT_TYPE: The JSON element does not have the requested type."); + UNIT_ASSERT_EXCEPTION_CONTAINS(PushToParser(42, R"({"a": "value1"} {"a": "value2"})"), yexception, "Failed to parse json messages, expected 1 json rows from offset 42 but got 2"); } } diff --git a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp index 75b4ab84e648..e52f18d9a2e8 100644 --- a/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp +++ b/ydb/core/fq/libs/row_dispatcher/ut/topic_session_ut.cpp @@ -100,8 +100,8 @@ class TFixture : public NUnitTest::TBaseFixture { settings.SetDatabase(GetDefaultPqDatabase()); settings.AddColumns("dt"); 
settings.AddColumns("value"); - settings.AddColumnTypes("Uint64"); - settings.AddColumnTypes("String"); + settings.AddColumnTypes("[DataType; Uint64]"); + settings.AddColumnTypes("[DataType; String]"); if (!emptyPredicate) { settings.SetPredicate("WHERE true"); } @@ -387,7 +387,7 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { auto source1 = BuildSource(topicName); auto source2 = BuildSource(topicName); source2.AddColumns("field1"); - source2.AddColumnTypes("String"); + source2.AddColumnTypes("[DataType; String]"); StartSession(ReadActorId1, source1); StartSession(ReadActorId2, source2); @@ -395,7 +395,6 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { TString json1 = "{\"dt\":101,\"value\":\"value1\", \"field1\":\"field1\"}"; TString json2 = "{\"dt\":102,\"value\":\"value2\", \"field1\":\"field2\"}"; - Sleep(TDuration::Seconds(3)); PQWrite({ json1, json2 }, topicName); ExpectNewDataArrived({ReadActorId1, ReadActorId2}); ExpectMessageBatch(ReadActorId1, { "{\"dt\":101,\"value\":\"value1\"}", "{\"dt\":102,\"value\":\"value2\"}" }); @@ -403,7 +402,7 @@ Y_UNIT_TEST_SUITE(TopicSessionTests) { auto source3 = BuildSource(topicName); source3.AddColumns("field2"); - source3.AddColumnTypes("String"); + source3.AddColumnTypes("[DataType; String]"); auto readActorId3 = Runtime.AllocateEdgeActor(); StartSession(readActorId3, source3); diff --git a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp index 530bda256dc0..f643c08e5876 100644 --- a/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp +++ b/ydb/library/yql/providers/pq/provider/yql_pq_dq_integration.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -257,7 +258,7 @@ class TPqDqIntegration: public TDqIntegrationBase { const auto rowSchema = topic.RowSpec().Ref().GetTypeAnn()->Cast()->GetType()->Cast(); for (const auto& item : rowSchema->GetItems()) { srcDesc.AddColumns(TString(item->GetName())); - 
srcDesc.AddColumnTypes(FormatType(item->GetItemType())); + srcDesc.AddColumnTypes(NCommon::WriteTypeToYson(item->GetItemType(), NYT::NYson::EYsonFormat::Text)); } NYql::NConnector::NApi::TPredicate predicateProto; diff --git a/ydb/tests/fq/yds/test_row_dispatcher.py b/ydb/tests/fq/yds/test_row_dispatcher.py index a2b9bdab9fa7..b8730297ac8c 100644 --- a/ydb/tests/fq/yds/test_row_dispatcher.py +++ b/ydb/tests/fq/yds/test_row_dispatcher.py @@ -233,15 +233,17 @@ def test_nested_types(self, kikimr, client): query_id = start_yds_query(kikimr, client, sql) wait_actor_count(kikimr, "FQ_ROW_DISPATCHER_SESSION", 1) + large_string = "abcdefghjkl1234567890+abcdefghjkl1234567890" data = [ - '{"time": 101, "data": {"key": "value"}, "event": "event1"}', - '{"time": 102, "data": ["key1", "key2"], "event": "event2"}', + '{"time": 101, "data": {"key": "value", "second_key":"' + large_string + '"}, "event": "event1"}', + '{"time": 102, "data": ["key1", "key2", "' + large_string + '"], "event": "event2"}', + '{"time": 103, "data": ["' + large_string + '"], "event": "event3"}', ] self.write_stream(data) expected = [ - '{"key": "value"}', - '["key1", "key2"]' + '{"key": "value", "second_key":"' + large_string + '"}', + '["key1", "key2", "' + large_string + '"]' ] assert self.read_stream(len(expected), topic_path=self.output_topic) == expected