16
16
from process_whole_archive_option import ProcessWholeArchiveOption
17
17
18
18
19
# Linker flags of static CUDA-related libraries mapped to the flag that is
# substituted for them when the command line is rewritten for dynamic linking.
# An empty string is substituted for flags that have no replacement listed.
# The keys also serve as the list of known static CUDA library flags.
CUDA_LIBRARIES = {
    # cuBLAS / CUDA runtime
    '-lcublas_static': '-lcublas',
    '-lcublasLt_static': '-lcublasLt',
    '-lcudart_static': '-lcudart',

    # cuDNN: every static component maps to the single `-lcudnn` flag
    '-lcudnn_static': '-lcudnn',
    '-lcudnn_adv_infer_static': '-lcudnn',
    '-lcudnn_adv_train_static': '-lcudnn',
    '-lcudnn_cnn_infer_static': '-lcudnn',
    '-lcudnn_cnn_train_static': '-lcudnn',
    '-lcudnn_ops_infer_static': '-lcudnn',
    '-lcudnn_ops_train_static': '-lcudnn',

    # cuFFT / CUPTI / cuRAND / cuSOLVER / cuSPARSE
    '-lcufft_static_nocallback': '-lcufft',
    '-lcupti_static': '-lcupti',
    '-lcurand_static': '-lcurand',
    '-lcusolver_static': '-lcusolver',
    '-lcusparse_static': '-lcusparse',

    # myelin
    '-lmyelin_compiler_static': '-lmyelin',
    '-lmyelin_executor_static': '-lnvcaffe_parser',
    '-lmyelin_pattern_library_static': '',
    '-lmyelin_pattern_runtime_static': '',

    # nvinfer and parsers
    '-lnvinfer_static': '-lnvinfer',
    '-lnvinfer_plugin_static': '-lnvinfer_plugin',
    '-lnvonnxparser_static': '-lnvonnxparser',
    '-lnvparsers_static': '-lnvparsers',

    # NVRTC / PTX compiler
    '-lnvrtc_static': '-lnvrtc',
    '-lnvrtc-builtins_static': '-lnvrtc-builtins',
    '-lnvptxcompiler_static': '',

    # NPP
    '-lnppc_static': '-lnppc',
    '-lnppial_static': '-lnppial',
    '-lnppicc_static': '-lnppicc',
    '-lnppicom_static': '-lnppicom',
    '-lnppidei_static': '-lnppidei',
    '-lnppif_static': '-lnppif',
    '-lnppig_static': '-lnppig',
    '-lnppim_static': '-lnppim',
    '-lnppist_static': '-lnppist',
    '-lnppisu_static': '-lnppisu',
    '-lnppitc_static': '-lnppitc',
    '-lnpps_static': '-lnpps',
}
59
-
60
-
61
class CUDAManager:
    """Keep the nvprune configuration for static CUDA libraries on a link line.

    An instance stores the set of library flags treated as carrying device
    code (fatbins), the ``-gencode`` arguments built from the requested
    architectures, and the nvprune executable to run.
    """

    def __init__(self, known_arches, nvprune_exe):
        """Build the manager.

        Args:
            known_arches: colon-separated architecture names, each of the form
                ``<prefix>_<version>`` (see ``_arch_flag``); may be empty or None.
            nvprune_exe: nvprune executable passed to ``subprocess``; may be
                empty or None, in which case ``can_prune_libs`` is False.
        """
        # Flags of known static CUDA libraries, minus those without device code.
        self.fatbin_libs = self._known_fatbin_libs(set(CUDA_LIBRARIES))
        # Flat list: ['-gencode', 'arch=...,code=...', '-gencode', ...].
        self.prune_args = self._build_prune_args(known_arches)
        self.nvprune_exe = nvprune_exe

    def has_cuda_fatbins(self, cmd):
        """Return True if `cmd` contains at least one flag from ``fatbin_libs``."""
        return bool(set(cmd) & self.fatbin_libs)

    @property
    def can_prune_libs(self):
        """True only when both prune arguments and the nvprune executable are set."""
        # Coerce explicitly: a bare `and` would leak a list, a string or None
        # out of a property that callers treat as a yes/no answer.
        return bool(self.prune_args and self.nvprune_exe)

    def _known_fatbin_libs(self, libs):
        """Return `libs` as a set without the libraries listed as having no device code."""
        libs_wo_device_code = {
            '-lcudart_static',
            '-lcupti_static',
            '-lnppc_static',
        }
        return set(libs) - libs_wo_device_code

    def _build_prune_args(self, known_arches):
        """Turn a colon-separated architecture list into repeated ``-gencode`` arguments."""
        prune_args = []
        if not known_arches:
            return prune_args
        for arch in known_arches.split(':'):
            # Tolerate stray separators ("sm_70:" or "sm_70::sm_80") instead of
            # failing inside _arch_flag with an unhelpful unpacking error.
            if not arch:
                continue
            prune_args.append('-gencode')
            prune_args.append(self._arch_flag(arch))
        return prune_args

    def _arch_flag(self, arch):
        """Format one ``-gencode`` value; `arch` is split at its first underscore.

        Raises:
            ValueError: if `arch` contains no underscore.
        """
        _, ver = arch.split('_', 1)
        return 'arch=compute_{},code={}'.format(ver, arch)

    def prune_lib(self, inp_fname, out_fname):
        """Run nvprune on `inp_fname`, writing the result to `out_fname`.

        Does nothing when no prune arguments were configured.

        Raises:
            subprocess.CalledProcessError: if nvprune exits with a non-zero status.
        """
        if self.prune_args:
            prune_command = [self.nvprune_exe] + self.prune_args + ['--output-file', out_fname, inp_fname]
            subprocess.check_call(prune_command)

    def write_linker_script(self, f):
        """Write the `.nv_fatbin` placement linker script to the file object `f`."""
        # This script simply says:
        # * Place all `.nv_fatbin` input sections from all input files into one `.nv_fatbin` output section of output file
        # * Place it after `.bss` section
        #
        # Motivation can be found here: https://maskray.me/blog/2021-07-04-sections-and-overwrite-sections#insert-before-and-insert-after
        # TL;DR - we put section with a lot of GPU code directly after the last meaningful section in the binary
        # (which turns out to be .bss)
        # In that case, we decrease chances of relocation overflows from .text to .bss,
        # because now these sections are close to each other
        script = textwrap.dedent("""
            SECTIONS {
                .nv_fatbin : { *(.nv_fatbin) }
            } INSERT AFTER .bss
        """).strip()

        f.write(script)
114
-
115
-
116
def tmpdir_generator(base_path, prefix):
    """Endlessly yield newly created directories named ``<prefix>_<N>`` under `base_path`.

    N starts at 0 and grows by one per yielded directory. Each directory is
    created with ``os.makedirs`` right before it is yielded, and the yielded
    value is its absolute path.
    """
    counter = 0
    while True:
        dir_name = prefix + '_' + str(counter)
        created = os.path.abspath(os.path.join(base_path, dir_name))
        os.makedirs(created)
        yield created
        counter += 1
121
-
122
-
123
def process_cuda_library_by_external_tool(cmd, build_root, tool_name, callable_tool_executor, allowed_cuda_libs):
    """Run an external tool over the static CUDA libraries referenced by `cmd`.

    Every flag found in `allowed_cuda_libs` gets a ``_<tool_name>`` suffix. The
    matching ``lib<name>.a`` is looked up in the ``-L`` directories that precede
    the flag, processed into a freshly created directory under `build_root`,
    and that directory is added to the command line as an extra ``-L`` flag.

    Args:
        cmd: iterable of linker command-line flags.
        build_root: directory under which output directories are created.
        tool_name: suffix appended to processed flags and library file names.
        callable_tool_executor: called as ``(from_path, to_path)`` per library.
        allowed_cuda_libs: container of ``-l...`` flags eligible for processing.

    Returns:
        A new list with the rewritten flags, in the original order.

    Raises:
        AssertionError: if a referenced library is not found in any preceding
            ``-L`` directory.
    """
    tmpdir_gen = tmpdir_generator(build_root, 'cuda_' + tool_name + '_libs')

    new_flags = []
    cuda_deps = set()

    # Because each directory flag only affects flags that follow it,
    # for correct pruning we need to process that in reversed order
    for flag in reversed(list(cmd)):
        if flag in allowed_cuda_libs:
            cuda_deps.add('lib' + flag[2:] + '.a')
            flag += '_' + tool_name
        elif flag.startswith('-L') and cuda_deps and os.path.isdir(flag[2:]):
            from_dirpath = flag[2:]
            # List the directory once; sort so the tool runs in a stable order.
            from_deps = sorted(cuda_deps.intersection(os.listdir(from_dirpath)))

            if from_deps:
                to_dirpath = next(tmpdir_gen)

                for f in from_deps:
                    from_path = os.path.join(from_dirpath, f)
                    # f ends with '.a': insert the tool suffix before the extension
                    to_path = os.path.join(to_dirpath, f[:-2] + '_' + tool_name + '.a')
                    callable_tool_executor(from_path, to_path)
                    cuda_deps.remove(f)

                # do not remove current directory
                # because it can contain other libraries we want link to
                # instead we just add new directory with processed by tool libs
                new_flags.append('-L' + to_dirpath)

        new_flags.append(flag)

    # Raised explicitly so the check survives `python -O`.
    if cuda_deps:
        raise AssertionError('Unresolved CUDA deps: ' + ','.join(sorted(cuda_deps)))

    # Return a real list: a `reversed` iterator would be silently empty on a
    # second iteration by the caller.
    new_flags.reverse()
    return new_flags
157
-
158
-
159
def process_cuda_libraries_by_objcopy(cmd, build_root, objcopy_exe):
    """Rename `.ctors` to `.init_array` in the static CUDA libraries used by `cmd`.

    When `objcopy_exe` is empty or None, `cmd` is returned untouched. Otherwise
    the work is delegated to ``process_cuda_library_by_external_tool`` with the
    tool name ``objcopy``, and its result is returned.
    """
    if not objcopy_exe:
        return cmd

    def rename_ctors(src_lib, dst_lib):
        # Copy src_lib to dst_lib with the section renamed.
        subprocess.check_call([objcopy_exe, "--rename-section", ".ctors=.init_array", src_lib, dst_lib])

    base_flags = set(CUDA_LIBRARIES) | {
        '-lcudadevrt',
        '-lcufilt',
        '-lculibos',
    }
    # Flags rewritten by the pruning step carry a `_pruner` suffix;
    # accept both the plain and the suffixed spelling of each library.
    candidate_flags = base_flags | {name + "_pruner" for name in base_flags}

    return process_cuda_library_by_external_tool(list(cmd), build_root, 'objcopy', rename_ctors, candidate_flags)
178
-
179
-
180
def process_cuda_libraries_by_nvprune(cmd, cuda_manager, build_root):
    """Add the fatbin linker script to `cmd` and prune CUDA libraries when possible.

    `cmd` is returned untouched when `cuda_manager` reports no fatbin libraries
    in it. Otherwise a linker script is written into a new directory under
    `build_root` and passed via ``-Wl,--script=``. If the manager can prune
    libraries, the extended command is additionally run through
    ``process_cuda_library_by_external_tool`` with the tool name ``pruner``.
    """
    if not cuda_manager.has_cuda_fatbins(cmd):
        return cmd

    # Write the custom linker script into its own directory.
    script_dir = next(tmpdir_generator(build_root, 'cuda_linker_script'))
    linker_script = os.path.join(script_dir, 'script')
    with open(linker_script, 'w') as script_file:
        cuda_manager.write_linker_script(script_file)

    extended_cmd = list(cmd)
    extended_cmd.append('-Wl,--script={}'.format(linker_script))

    if cuda_manager.can_prune_libs:
        return process_cuda_library_by_external_tool(
            extended_cmd, build_root, 'pruner', cuda_manager.prune_lib, cuda_manager.fatbin_libs
        )
    return extended_cmd
195
-
196
-
197
19
def remove_excessive_flags (cmd ):
198
20
flags = []
199
21
for flag in cmd :
@@ -202,16 +24,6 @@ def remove_excessive_flags(cmd):
202
24
return flags
203
25
204
26
205
def fix_cmd_for_dynamic_cuda(cmd):
    """Return a new list where each flag listed in CUDA_LIBRARIES is replaced by its mapped value.

    Flags that are not keys of CUDA_LIBRARIES are copied through unchanged.
    """
    return [CUDA_LIBRARIES.get(flag, flag) for flag in cmd]
213
-
214
-
215
27
def remove_libs (cmd , libs ):
216
28
excluded_flags = ['-l{}' .format (lib ) for lib in libs ]
217
29
@@ -270,13 +82,6 @@ def parse_args(args):
270
82
cmd = args
271
83
cmd = remove_excessive_flags (cmd )
272
84
273
- if opts .dynamic_cuda :
274
- cmd = fix_cmd_for_dynamic_cuda (cmd )
275
- else :
276
- cuda_manager = CUDAManager (opts .cuda_architectures , opts .nvprune_exe )
277
- cmd = process_cuda_libraries_by_nvprune (cmd , cuda_manager , opts .build_root )
278
- cmd = process_cuda_libraries_by_objcopy (cmd , opts .build_root , opts .objcopy_exe )
279
-
280
85
if opts .exclude_libs :
281
86
cmd = remove_libs (cmd , opts .exclude_libs )
282
87
0 commit comments