7
7
import random
8
8
from typing import List
9
9
10
-
11
10
# Directory that receives the generated data file and the query files.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
OUTPUT_DIR = "/home/kardymon-d/ydb3/ydb/ydb/library/yql/tools/dqrun/data"
# Number of query files to generate.
QUERIES_NUM = 200
# Table name interpolated into the FROM clause of every generated query.
TABLE_NAME = "pq.`match`"
# Number of rows in the generated data set.
ROW_NUM = 550000
# Number of columns of each supported column type.
UINT64_COLUMN_NUM = 20
STR32_COLUMN_NUM = 10
STR64_COLUMN_NUM = 10
COLUMN_NUM = UINT64_COLUMN_NUM + STR32_COLUMN_NUM + STR64_COLUMN_NUM
# Number of populated cells per generated row; remaining cells are None.
FILLED_COLUMN_NUM = 40
# A row is a plain list of cell values.
Row = List
22
20
23
21
22
+
23
+ # OUTPUT_DIR = "/home/kardymon-d/ydb3/ydb/ydb/library/yql/tools/dqrun/data"
24
+ # QUERIES_NUM = 1
25
+ # TABLE_NAME = "pq.`match`"
26
+ # ROW_NUM = 600000
27
+ # UINT64_COLUMN_NUM = 3
28
+ # STR32_COLUMN_NUM = 1
29
+ # STR64_COLUMN_NUM = 1
30
+ # COLUMN_NUM = UINT64_COLUMN_NUM + STR32_COLUMN_NUM + STR64_COLUMN_NUM
31
+ # FILLED_COLUMN_NUM = 5
32
+ # Row = List
33
+
34
+
24
35
def get_timer() -> int:
    """Return the next value of a call counter stored on the function itself.

    The counter is kept in the ``get_timer.timer`` attribute. If that
    attribute has not been set yet, counting starts from 0, so the first call
    returns 1 instead of failing with ``AttributeError``.

    Returns:
        The counter value after incrementing it by one.
    """
    get_timer.timer = getattr(get_timer, "timer", 0) + 1
    return get_timer.timer
@@ -36,8 +47,8 @@ class Table:
36
47
37
48
38
49
def validate() -> None:
    """Check that the module-level generation settings are consistent.

    Explicit raises are used instead of ``assert`` so the checks still run
    when Python is started with ``-O``.

    Raises:
        ValueError: if ``QUERIES_NUM`` is not smaller than ``ROW_NUM``, or if
            ``FILLED_COLUMN_NUM`` is not in the range ``1..COLUMN_NUM``.
    """
    if not QUERIES_NUM < ROW_NUM:
        raise ValueError(
            f"QUERIES_NUM ({QUERIES_NUM}) must be less than ROW_NUM ({ROW_NUM})"
        )
    if not 0 < FILLED_COLUMN_NUM <= COLUMN_NUM:
        raise ValueError(
            f"FILLED_COLUMN_NUM ({FILLED_COLUMN_NUM}) must be in 1..COLUMN_NUM ({COLUMN_NUM})"
        )
41
52
42
53
43
54
def gen_column_names (column_num : int ) -> List [str ]:
@@ -64,31 +75,29 @@ def type_to_sql(column_type: str) -> str:
64
75
def int_to_str(n: int, length: int) -> str:
    """Encode a non-negative integer as a lowercase string of fixed width.

    The number is written in base 26 with digits ``'a'``..``'z'``, least
    significant digit first, and then right-padded with ``'a'`` (the zero
    digit) up to ``length`` characters. Distinct values therefore map to
    distinct strings of the same length.

    Args:
        n: Non-negative integer to encode.
        length: Minimum length of the result. If the encoding needs more
            characters than ``length``, it is returned unpadded and longer.

    Returns:
        The encoded string.

    Raises:
        ValueError: if ``n`` is negative (it would otherwise silently encode
            to the same string as 0).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    digits = []
    while n > 0:
        n, mod = divmod(n, 26)
        digits.append(chr(ord('a') + mod))

    # A non-positive repeat count yields an empty pad, so over-long encodings
    # are returned as-is.
    digits.append('a' * (length - len(digits)))
    return "".join(digits)
72
83
73
84
74
def gen_cell(row_index: int, column_type: str):
    """Produce the deterministic cell value for a row and column type.

    Every cell of a row is derived from the same number, ``2 * row_index``.

    Args:
        row_index: Zero-based index of the row being generated.
        column_type: One of ``"uint64"``, ``"str32"`` or ``"str64"``.

    Returns:
        The integer itself for ``"uint64"``; for the string types, the
        ``int_to_str`` encoding of it padded to 32 (``"str32"``) or
        100 (``"str64"``) characters.

    Raises:
        RuntimeError: if ``column_type`` is not one of the supported types.
    """
    value = 2 * row_index
    if column_type == "uint64":
        return value
    if column_type == "str32":
        return int_to_str(value, 32)
    if column_type == "str64":
        # NOTE(review): width is 100 although the type is named str64 — confirm intended.
        return int_to_str(value, 100)
    raise RuntimeError(f"unsupported column type: {column_type!r}")
86
95
87
96
88
def gen_row(row_index: int, column_types: List[str]) -> "Row":
    """Build one table row with a random subset of its cells populated.

    Args:
        row_index: Zero-based index of the row; passed to ``gen_cell`` for
            every populated cell.
        column_types: Type name of each column, in column order.

    Returns:
        A list with one entry per column: the ``gen_cell`` value for populated
        columns and ``None`` for the rest. At most ``FILLED_COLUMN_NUM``
        columns are populated, at randomly chosen positions.
    """
    # Start with the first FILLED_COLUMN_NUM flags set, then shuffle so the
    # populated positions are random while their count stays fixed.
    are_filled = [i < FILLED_COLUMN_NUM for i in range(len(column_types))]
    random.shuffle(are_filled)
    return [
        gen_cell(row_index, column_type) if is_filled else None
        for is_filled, column_type in zip(are_filled, column_types)
    ]
92
101
93
102
94
103
def gen_table (name : str , column_names : List [str ], column_types : List [str ], rows : List [Row ]) -> Table :
@@ -108,18 +117,41 @@ def write_table(filename: str, table: Table) -> None:
108
117
file .write ('\n ' )
109
118
110
119
111
- def gen_query (table : Table , column_index : int , value ) -> str :
112
- return f"""SELECT *
120
def gen_value(query_index: int, table: "Table") -> str:
    """Return the literal that the query with this index compares against.

    The column is chosen as ``query_index`` modulo the number of columns, and
    the value is taken from that column in the second-to-last row of the
    table, so the generated query matches data that is actually present.

    Args:
        query_index: Zero-based index of the query being generated.
        table: Table providing ``column_types`` and ``rows``.

    Returns:
        The value rendered as a query literal: ``<n>UL`` for ``"uint64"``
        columns and a double-quoted string for ``"str32"`` / ``"str64"``.

    Raises:
        ValueError: if the sampled cell is ``None`` (no literal can be built).
        RuntimeError: if the column type is not supported.
    """
    index = query_index % len(table.column_types)
    column_type = table.column_types[index]

    # Sample the second-to-last row.
    value = table.rows[len(table.rows) - 2][index]
    if value is None:
        raise ValueError(
            f"column {index} is empty in the sampled row; cannot build a literal"
        )

    if column_type == "uint64":
        return f"{value}UL"
    if column_type in ("str32", "str64"):
        return f'"{value}"'
    raise RuntimeError(f"unsupported column type: {column_type!r}")
139
+
140
+
141
def gen_query(query_index: int, table: "Table", value: str) -> str:
    """Render the text of one query that filters the table on a single column.

    The query selects all rows of ``table.name`` (read as ``json_each_row``
    with an explicit schema) whose chosen column equals ``value``, binds the
    result to ``$match`` and inserts each matching row, serialized as JSON,
    into ``pq.`match```.

    Args:
        query_index: Zero-based index of the query; the filter column is
            ``query_index`` modulo the number of columns.
        table: Table providing ``name``, ``column_names`` and ``column_types``.
        value: Already-rendered literal to compare the column against.

    Returns:
        The complete query text.
    """
    schema = ",".join(
        f"{column_name} {type_to_sql(column_type)}"
        for column_name, column_type in zip(table.column_names, table.column_types)
    )
    filter_column = table.column_names[query_index % len(table.column_names)]
    return f"""$match = SELECT *
FROM {table.name}
WITH (
    FORMAT=json_each_row,
    SCHEMA
    (
        {schema}
    )
)
WHERE {filter_column} == {value};

INSERT INTO pq.`match`
SELECT ToBytes(Unwrap(Yson::SerializeJson(Yson::From(TableRow())))) FROM $match;
"""
124
156
125
157
@@ -133,15 +165,14 @@ def main():
133
165
134
166
column_names = gen_column_names (COLUMN_NUM )
135
167
column_types = gen_column_types (COLUMN_NUM )
136
- rows = [gen_row (column_types ) for _ in range (ROW_NUM )]
168
+ rows = [gen_row (row_index , column_types ) for row_index in range (ROW_NUM )]
137
169
table = gen_table (TABLE_NAME , column_names , column_types , rows )
138
170
write_table (f"{ OUTPUT_DIR } /data.txt" , table )
139
171
140
172
pathlib .Path (f"{ OUTPUT_DIR } /query/" ).mkdir (parents = True , exist_ok = True )
141
- for i in range (QUERIES_NUM ):
142
- column_index = i % COLUMN_NUM
143
- value = gen_value (column_types [column_index ])
144
- write_query (f"{ OUTPUT_DIR } /query/{ i } .txt" , gen_query (table , column_index , value ))
173
+ for query_index in range (QUERIES_NUM ):
174
+ value = gen_value (query_index , table )
175
+ write_query (f"{ OUTPUT_DIR } /query/{ query_index } .txt" , gen_query (query_index , table , value ))
145
176
146
177
147
178
if __name__ == "__main__" :
0 commit comments