a

kardymonds · kardymonds · commit a3a2bc87a1d8 · 2024-09-29T19:40:07.000Z
diff --git a/ydb/library/yql/providers/pq/async_io/dq_pq_write_actor.cpp b/ydb/library/yql/providers/pq/async_io/dq_pq_write_actor.cpp
@@ -152,8 +152,8 @@ class TDqPqWriteActor : public NActors::TActor<TDqPqWriteActor>, public IDqCompu
         , LogPrefix(TStringBuilder() << "SelfId: " << this->SelfId() << ", TxId: " << TxId << ", TaskId: " << taskId << ", PQ sink. ")
         , FreeSpace(freeSpace)
         , TopicClient(Driver, GetTopicClientSettings())
-        , File(std::get<TString>(TxId), EOpenModeFlag::CreateAlways |  EOpenModeFlag::WrOnly)
+        , File("data/result/" + std::get<TString>(TxId), EOpenModeFlag::CreateAlways |  EOpenModeFlag::WrOnly)
     {
         EgressStats.Level = statsLevel;
     }
 
diff --git a/ydb/library/yql/tools/dqrun/dqrun.cpp b/ydb/library/yql/tools/dqrun/dqrun.cpp
@@ -99,7 +99,6 @@
 #include <library/cpp/digest/md5/md5.h>
 #include <ydb/library/actors/http/http_proxy.h>
 
-#include <util/folder/iterator.h>
 #include <util/generic/string.h>
 #include <util/generic/hash.h>
 #include <util/generic/scope.h>
@@ -1133,11 +1132,11 @@ int RunMain(int argc, const char* argv[])
     );
 
     TVector<TProgramPtr> programs;
-    for (const auto& entry : TDirIterator(progFiles)) {
-        if (entry.fts_type != FTS_F) {
-            continue;
+    for (int i = 0;; ++i) {
+        auto progFile = TString("data/query/" + std::to_string(i) + ".txt");
+        if (!NFs::Exists(progFile)) {
+            break;
         }
-        const auto& progFile = entry.fts_path;
 
     TProgramPtr program;
     if (res.Has("replay") && res.Has("capture")) {
diff --git a/ydb/library/yql/tools/dqrun/gen.py b/ydb/library/yql/tools/dqrun/gen.py
@@ -7,20 +7,31 @@
 import random
 from typing import List
 
-
 OUTPUT_DIR = "/home/kardymon-d/ydb3/ydb/ydb/library/yql/tools/dqrun/data"
-QUERIES_NUM = 2
+QUERIES_NUM = 200
 TABLE_NAME = "pq.`match`"
-ROW_NUM = 1000
-UINT64_COLUMN_NUM = 3#750
-STR32_COLUMN_NUM = 0#375
-STR64_COLUMN_NUM = 0#375
+ROW_NUM = 550000
+UINT64_COLUMN_NUM = 20
+STR32_COLUMN_NUM = 10
+STR64_COLUMN_NUM = 10
 COLUMN_NUM = UINT64_COLUMN_NUM + STR32_COLUMN_NUM + STR64_COLUMN_NUM
-FILLED_COLUMN_NUM = 2
-MAX_VALUE = 2
+FILLED_COLUMN_NUM = 40
 Row = List
 
 
+
+# OUTPUT_DIR = "/home/kardymon-d/ydb3/ydb/ydb/library/yql/tools/dqrun/data"
+# QUERIES_NUM = 1
+# TABLE_NAME = "pq.`match`"
+# ROW_NUM = 600000
+# UINT64_COLUMN_NUM = 3
+# STR32_COLUMN_NUM = 1
+# STR64_COLUMN_NUM = 1
+# COLUMN_NUM = UINT64_COLUMN_NUM + STR32_COLUMN_NUM + STR64_COLUMN_NUM
+# FILLED_COLUMN_NUM = 5
+# Row = List
+
+
 def get_timer() -> int:
     get_timer.timer += 1
     return get_timer.timer
@@ -36,8 +47,8 @@ class Table:
 
 
 def validate() -> None:
-    assert(0 < FILLED_COLUMN_NUM < COLUMN_NUM)
-    assert(0 < MAX_VALUE < 2 ** 64)
+    assert(QUERIES_NUM < ROW_NUM)
+    assert(0 < FILLED_COLUMN_NUM <= COLUMN_NUM)
 
 
 def gen_column_names(column_num: int) -> List[str]:
@@ -64,31 +75,29 @@ def type_to_sql(column_type: str) -> str:
 def int_to_str(n: int, length: int) -> str:
     result = ""
     while n > 0:
-        div, mod = divmod(n, 10)
+        div, mod = divmod(n, 26)
         result += chr(ord('a') + mod)
         n = div
-    result += '_' * (length - len(result))
+    result += 'a' * (length - len(result))
     return result
 
 
-def gen_value(column_type: str):
-    value = random.randint(0, MAX_VALUE - 1)
+def gen_cell(row_index: int, column_type: str):
+    value = 2 * row_index
     if column_type == "uint64":
         return value
     elif column_type == "str32":
         return int_to_str(value, 32)
-        # return "".join(random.choices(string.ascii_lowercase, k=32))
     elif column_type == "str64":
-        return int_to_str(value, 64)
-        # return "".join(random.choices(string.ascii_lowercase, k=64))
+        return int_to_str(value, 100)
     else:
         raise RuntimeError()
 
 
-def gen_row(column_types: List[str]) -> Row:
+def gen_row(row_index: int, column_types: List[str]) -> Row:
     are_filled = [i < FILLED_COLUMN_NUM for i in range(len(column_types))]
     random.shuffle(are_filled)
-    return [gen_value(column_type) if is_filled else None for is_filled, column_type in zip(are_filled, column_types)]
+    return [gen_cell(row_index, column_type) if is_filled else None for is_filled, column_type in zip(are_filled, column_types)]
 
 
 def gen_table(name: str, column_names: List[str], column_types: List[str], rows: List[Row]) -> Table:
@@ -108,18 +117,41 @@ def write_table(filename: str, table: Table) -> None:
             file.write('\n')
 
 
-def gen_query(table: Table, column_index: int, value) -> str:
-    return f"""SELECT *
+def gen_value(query_index: int, table: Table) -> str:
+    """Return the SQL literal that query number ``query_index`` compares against.
+
+    The column is chosen as ``query_index % len(table.column_types)`` and the
+    raw value is read from the second-to-last row of ``table.rows``.
+    uint64 values get a ``UL`` suffix; str32/str64 values are double-quoted.
+    Raises RuntimeError for any other column type.
+    """
+    index = query_index % len(table.column_types)
+    column_type = table.column_types[index]
+    value = table.rows[len(table.rows) - 2][index]
+    if column_type == "uint64":
+        return f"{value}UL"
+    elif column_type == "str32":
+        return f'"{value}"'
+    elif column_type == "str64":
+        return f'"{value}"'
+    else:
+        raise RuntimeError()
+
+
+def gen_query(query_index: int, table: Table, value: str) -> str:
+    return f"""$match = SELECT *
 FROM {table.name}
 WITH (
     FORMAT=json_each_row,
     SCHEMA
     (
-        {",".join([name + " " + type_to_sql(type) + "?" for name, type in zip(table.column_names, table.column_types)])}
+        {",".join([name + " " + type_to_sql(type) for name, type in zip(table.column_names, table.column_types)])}
     )
 )
-WHERE {table.column_names[column_index]} IS NOT NULL AND {table.column_names[column_index]} == {value}
-LIMIT 20;
+WHERE {table.column_names[query_index % len(table.column_names)]} == {value};
+
+INSERT INTO pq.`match`
+SELECT ToBytes(Unwrap(Yson::SerializeJson(Yson::From(TableRow())))) FROM $match;
 """
 
 
@@ -133,15 +165,14 @@ def main():
 
     column_names = gen_column_names(COLUMN_NUM)
     column_types = gen_column_types(COLUMN_NUM)
-    rows = [gen_row(column_types) for _ in range(ROW_NUM)]
+    rows = [gen_row(row_index, column_types) for row_index in range(ROW_NUM)]
     table = gen_table(TABLE_NAME, column_names, column_types, rows)
     write_table(f"{OUTPUT_DIR}/data.txt", table)
 
     pathlib.Path(f"{OUTPUT_DIR}/query/").mkdir(parents=True, exist_ok=True)
-    for i in range(QUERIES_NUM):
-        column_index = i % COLUMN_NUM
-        value = gen_value(column_types[column_index])
-        write_query(f"{OUTPUT_DIR}/query/{i}.txt", gen_query(table, column_index, value))
+    for query_index in range(QUERIES_NUM):
+        value = gen_value(query_index, table)
+        write_query(f"{OUTPUT_DIR}/query/{query_index}.txt", gen_query(query_index, table, value))
 
 
 if __name__ == "__main__":