6
6
7
7
8
8
# Directory that receives the generated data file and the query/ subdirectory.
OUTPUT_DIR = "/home/vokayndzop/ydb/ydb/library/yql/tools/dqrun/data/"
# Number of query files to generate.
QUERIES_NUM = 100
# Table name interpolated into the FROM clause of every generated query.
TABLE_NAME = "pq.`match`"
# Number of data rows to generate.
ROW_NUM = 1000
# Column counts per type; presumably consumed by gen_column_types, which is
# not visible here -- TODO confirm.
UINT64_COLUMN_NUM = 10
STR32_COLUMN_NUM = 5
STR64_COLUMN_NUM = 5
COLUMN_NUM = UINT64_COLUMN_NUM + STR32_COLUMN_NUM + STR64_COLUMN_NUM
# Number of non-None cells in every generated row (the rest are None).
FILLED_COLUMN_NUM = 20

# A row is a plain list of cell values (int, str or None).
Row = List
19
18
20
19
@@ -33,8 +32,8 @@ class Table:
33
32
34
33
35
34
def validate() -> None:
    """Sanity-check the module-level generation parameters.

    Raises:
        AssertionError: if QUERIES_NUM is not smaller than ROW_NUM, or if
            FILLED_COLUMN_NUM is outside the range 1..COLUMN_NUM.
    """
    assert QUERIES_NUM < ROW_NUM, (
        f"QUERIES_NUM ({QUERIES_NUM}) must be less than ROW_NUM ({ROW_NUM})"
    )
    assert 0 < FILLED_COLUMN_NUM <= COLUMN_NUM, (
        f"FILLED_COLUMN_NUM ({FILLED_COLUMN_NUM}) must be in 1..{COLUMN_NUM}"
    )
38
37
39
38
40
39
def gen_column_names (column_num : int ) -> List [str ]:
@@ -61,31 +60,29 @@ def type_to_sql(column_type: str) -> str:
61
60
def int_to_str(n: int, length: int) -> str:
    """Encode a non-negative integer as a lowercase string of base-26 digits.

    Digits are emitted least-significant first, with 'a' standing for 0 and
    'z' for 25. The result is right-padded with 'a' (the zero digit) up to
    ``length`` characters, so distinct values always give distinct strings.

    Args:
        n: Value to encode; must be >= 0.
        length: Minimum length of the result. If ``n`` needs more digits than
            this, all digits are kept and the result is longer.

    Returns:
        The encoded string, at least ``length`` characters long.

    Raises:
        ValueError: if ``n`` is negative (it would otherwise silently collide
            with the encoding of 0).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    digits = []
    while n > 0:
        n, digit = divmod(n, 26)
        digits.append(chr(ord('a') + digit))
    # Pad with single 'a' characters; a negative repeat count yields ''.
    digits.append('a' * (length - len(digits)))
    return "".join(digits)
69
68
70
69
71
def gen_cell(row_index: int, column_type: str):
    """Produce the deterministic cell value for a row and column type.

    The underlying number is always ``2 * row_index``, so stored values are
    even. Presumably this lets generated queries with odd values match no
    row -- TODO confirm against gen_value.

    Args:
        row_index: Zero-based index of the row being generated.
        column_type: One of "uint64", "str32" or "str64".

    Returns:
        An int for "uint64"; for the string types, the number encoded by
        int_to_str and padded to 32 or 64 characters.

    Raises:
        RuntimeError: if ``column_type`` is not one of the supported names.
    """
    value = 2 * row_index
    if column_type == "uint64":
        return value
    elif column_type == "str32":
        return int_to_str(value, 32)
    elif column_type == "str64":
        return int_to_str(value, 64)
    else:
        raise RuntimeError(f"unknown column type: {column_type!r}")
83
80
84
81
85
def gen_row(row_index: int, column_types: List[str]) -> Row:
    """Build one row with a random subset of its cells filled in.

    Exactly ``min(FILLED_COLUMN_NUM, len(column_types))`` cells receive a
    value from gen_cell; the remaining cells are None. Which positions are
    filled is decided by shuffling a boolean mask with ``random.shuffle``.

    Args:
        row_index: Zero-based index of the row, forwarded to gen_cell.
        column_types: Type name of every column, in column order.

    Returns:
        A list with one entry per column: a generated value or None.
    """
    are_filled = [i < FILLED_COLUMN_NUM for i in range(len(column_types))]
    random.shuffle(are_filled)
    return [
        gen_cell(row_index, column_type) if is_filled else None
        for is_filled, column_type in zip(are_filled, column_types)
    ]
89
86
90
87
91
88
def gen_table (name : str , column_names : List [str ], column_types : List [str ], rows : List [Row ]) -> Table :
@@ -105,18 +102,33 @@ def write_table(filename: str, table: Table) -> None:
105
102
file .write ('\n ' )
106
103
107
104
108
- def gen_query (table : Table , column_index : int , value ) -> str :
109
- return f"""SELECT *
105
def gen_value(query_index: int, table: Table) -> str:
    """Render the literal that query number ``query_index`` compares against.

    The column is chosen round-robin (``query_index`` modulo the number of
    columns) and the number is ``query_index % (2 * ROW_NUM)``.

    Args:
        query_index: Zero-based index of the query being generated.
        table: Table whose ``column_types`` decides how the literal is rendered.

    Returns:
        A literal for the query text: ``<n>UL`` for "uint64" columns, or a
        double-quoted int_to_str encoding (32 or 64 characters) for the
        string columns.

    Raises:
        RuntimeError: if the selected column has an unsupported type.
    """
    column_type = table.column_types[query_index % len(table.column_types)]
    value = query_index % (2 * ROW_NUM)
    if column_type == "uint64":
        # The suffix must follow the digits directly to form one literal.
        return f"{value}UL"
    elif column_type == "str32":
        # No padding inside the quotes, or the literal never equals a stored cell.
        return f'"{int_to_str(value, 32)}"'
    elif column_type == "str64":
        return f'"{int_to_str(value, 64)}"'
    else:
        raise RuntimeError(f"unknown column type: {column_type!r}")
116
+
117
+
118
def gen_query(query_index: int, table: Table, value: str) -> str:
    """Render the text of one query over ``table``.

    The query binds ``$match`` to all rows whose selected column equals
    ``value``, then inserts every matching row, serialized as JSON bytes,
    into pq.`match`.

    Args:
        query_index: Zero-based query index; picks the filtered column
            round-robin over ``table.column_names``.
        table: Supplies the table name and the column names/types that make
            up the inline SCHEMA clause.
        value: Already-rendered literal to compare the column against.

    Returns:
        The complete query text.
    """
    schema = ",".join(
        name + " " + type_to_sql(column_type)
        for name, column_type in zip(table.column_names, table.column_types)
    )
    column = table.column_names[query_index % len(table.column_names)]
    return f"""$match = SELECT *
FROM {table.name}
WITH (
    FORMAT=json_each_row,
    SCHEMA
    (
        {schema}
    )
)
WHERE {column} == {value};

INSERT INTO pq.`match`
SELECT ToBytes(Unwrap(Yson::SerializeJson(Yson::From(TableRow())))) FROM $match;
"""
121
133
122
134
@@ -130,15 +142,14 @@ def main():
130
142
131
143
column_names = gen_column_names (COLUMN_NUM )
132
144
column_types = gen_column_types (COLUMN_NUM )
133
- rows = [gen_row (column_types ) for _ in range (ROW_NUM )]
145
+ rows = [gen_row (row_index , column_types ) for row_index in range (ROW_NUM )]
134
146
table = gen_table (TABLE_NAME , column_names , column_types , rows )
135
147
write_table (f"{ OUTPUT_DIR } /data.txt" , table )
136
148
137
149
pathlib .Path (f"{ OUTPUT_DIR } /query/" ).mkdir (parents = True , exist_ok = True )
138
- for i in range (QUERIES_NUM ):
139
- column_index = i % COLUMN_NUM
140
- value = gen_value (column_types [column_index ])
141
- write_query (f"{ OUTPUT_DIR } /query/{ i } .txt" , gen_query (table , column_index , value ))
150
+ for query_index in range (QUERIES_NUM ):
151
+ value = gen_value (query_index , table )
152
+ write_query (f"{ OUTPUT_DIR } /query/{ query_index } .txt" , gen_query (query_index , table , value ))
142
153
143
154
144
155
if __name__ == "__main__" :
0 commit comments