Merge branch 'vitis_accelerator_dev' into makefile_update

alex-yang-upenn · web-flow · commit 9693b2534038 · 2024-06-25T00:09:42.000-07:00
diff --git a/docs/advanced/accelerator.rst b/docs/advanced/accelerator.rst
@@ -75,3 +75,50 @@ The ``predict`` method will send the input data to the PL and return the output
 
     nn = NeuralNetworkOverlay('hls4ml_nn.bit', X_test.shape, y_test.shape)
     y_hw, latency, throughput = nn.predict(X_test, profile=True)
+
+========================
+VitisAccelerator Backend
+========================
+
+The ``VitisAccelerator`` backend makes use of the Vitis kernel flow and streamlines the generation of an hls4ml project targeting PCIe accelerators.
+The ``VitisAccelerator`` backend supports the following boards:
+
+* `Alveo u50 <https://www.xilinx.com/products/boards-and-kits/alveo/u50.html>`_
+* `Alveo u55c <https://www.xilinx.com/products/boards-and-kits/alveo/u55c.html>`_
+* `Alveo u250 <https://www.xilinx.com/products/boards-and-kits/alveo/u250.html>`_
+* `Versal vck5000 <https://www.xilinx.com/products/boards-and-kits/vck5000.html>`_
+
+The backend also generates OpenCL host code that uploads and runs the kernel on the accelerator card.
+
+Example
+=======
+
+The following example is a modified version of `hls4ml example 7 <https://github.com/fastmachinelearning/hls4ml-tutorial/blob/master/part7_deployment.ipynb>`_.
+
+.. code-block:: Python
+
+    import hls4ml
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model,
+        hls_config=config,
+        output_dir='model_3/hls4ml_prj_vitis_accel',
+        backend='VitisAccelerator',
+        board='alveo-u55c',
+        num_kernel=4,
+        num_thread=8,
+        batchsize=8192
+    )
+    hls_model.compile()
+    hls_model.build()
+
+By default the build method generates all the necessary files to run the kernel on the accelerator board. As this can be a long process, there are three build options that target the generation of specific parts of the project:
+
+* ``host``: Compiles the host application
+* ``hls``: Produces only the kernel's object file
+* ``xclbin``: Produces only the kernel's .xclbin file
+
+The generated host application can be run with the xclbin file as follows:
+
+.. code-block:: Bash
+
+    ./host <myproject>.xclbin
diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json
@@ -18,8 +18,8 @@
       "memory": {"type": "ddr", "channels": 4, "capacity": 64}
     },
     "vck5000": {
-      "board_type": "alveo-versal",
-      "part": "xcvc1902-2msevsvd1760", 
+      "board_type": "versal",
+      "part": "xcvc1902-vsvd1760-2MP-e-S",
       "platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1",
       "memory":{"type": "ddr", "channels": 3, "capacity": 12}
     }
diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py
@@ -21,7 +21,8 @@ def create_initial_config(
         io_type='io_parallel',
         num_kernel=1,
         num_thread=1,
-        batchsize=8192
+        batchsize=8192,
+        vivado_directives=[]
     ):
         '''
         Create initial accelerator config with default parameters
@@ -32,6 +33,8 @@ def create_initial_config(
             io_type: io_parallel or io_stream
             num_kernel: how many compute units to create on the fpga
             num_thread: how many threads the host cpu uses to drive the fpga
+            batchsize: how many samples to process within a single buffer on the fpga
+            vivado_directives: Directives passed down to Vivado that control the hardware synthesis and implementation steps
         Returns:
             populated config
         '''
@@ -42,6 +45,7 @@ def create_initial_config(
         config['AcceleratorConfig']['Num_Kernel'] = num_kernel
         config['AcceleratorConfig']['Num_Thread'] = num_thread
         config['AcceleratorConfig']['Batchsize'] = batchsize
+        config['AcceleratorConfig']['Vivado_Directives'] = vivado_directives
         return config
 
     def build(self, model, reset=False, synth=True, vsynth=True, csim=False, cosim=False, debug=False, **kwargs):
diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py
@@ -30,7 +30,9 @@ def __init__(self, config):
         
         self.num_kernel = accel_config.get('Num_Kernel')
         self.num_thread = accel_config.get('Num_Thread')
-        self.batchsize = accel_config.get('Batchsize')        
+        self.batchsize = accel_config.get('Batchsize')
+
+        self.vivado_directives = accel_config.get('Vivado_Directives')    
 
     def get_board_type(self):
         return self.board_type
@@ -52,3 +54,6 @@ def get_memory_type(self):
     
     def get_memory_channel_count(self):
         return self.memory_channel_count
+    
+    def get_vivado_directives(self):
+        return self.vivado_directives
diff --git a/hls4ml/backends/vitis_accelerator/vivado_directives.json b/hls4ml/backends/vitis_accelerator/vivado_directives.json
@@ -0,0 +1,137 @@
+{
+    "impl.strategies": [
+        "Performance_Explore",
+        "Performance_ExplorePostRoutePhysOpt",
+        "Performance_LBlockPlacement",
+        "Performance_LBlockPlacementFanoutOpt",
+        "Performance_NetDelay_high",
+        "Performance_NetDelay_low",
+        "Performance_Retiming",
+        "Performance_ExtraTimingOpt",
+        "Performance_RefinePlacement",
+        "Performance_SpreadSLL",
+        "Performance_BalanceSLL",
+        "Congestion_SpreadLogic_high",
+        "Congestion_SpreadLogic_medium",
+        "Congestion_SpreadLogic_low",
+        "Congestion_SpreadLogic_Explore",
+        "Congestion_SSI_SpreadLogic_high",
+        "Congestion_SSI_SpreadLogic_low",
+        "Area_Explore",
+        "Area_ExploreSequential",
+        "Area_ExploreWithRemap",
+        "Power_DefaultOpt",
+        "Power_ExploreArea",
+        "Flow_RunPhysOpt",
+        "Flow_RunPostRoutePhysOpt",
+        "Flow_RuntimeOptimized",
+        "Flow_Quick",
+        "ALL"
+    ],
+    "prop": {
+        "run": {
+            "impl": {
+                "STEPS": {
+                    "OPT_DESIGN": {
+                        "ARGS": {
+                            "DIRECTIVE": [
+                                "Explore",
+                                "ExploreArea",
+                                "ExploreSequentialArea",
+                                "RuntimeOptimized",
+                                "ExploreWithRemap"
+                            ]
+                        }
+                    },
+                    "POWER_OPT_DESIGN": {
+                        "IS_ENABLED": [
+                            "true"
+                        ]
+                    },
+                    "PLACE_DESIGN": {
+                        "ARGS": {
+                            "DIRECTIVE": [
+                                "Explore",
+                                "WLDrivenBlockPlacement",
+                                "EarlyBlockPlacement",
+                                "ExtraNetDelay_high",
+                                "ExtraNetDelay_low",
+                                "SSI_SpreadLogic_high",
+                                "SSI_SpreadLogic_low",
+                                "AltSpreadLogic_high",
+                                "AltSpreadLogic_medium",
+                                "AltSpreadLogic_low",
+                                "ExtraPostPlacementOpt",
+                                "ExtraTimingOpt",
+                                "SSI_SpreadSLLs",
+                                "SSI_BalanceSLLs",
+                                "SSI_Balance_SLRs",
+                                "SSI_HighUtilSLRs",
+                                "RuntimeOptimized",
+                                "Quick",
+                                "Auto_1",
+                                "Auto_2",
+                                "Auto_3"
+                            ]
+                        }
+                    },
+                    "POST_PLACE_POWER_OPT_DESIGN": {
+                        "IS_ENABLED": [
+                            "true"
+                        ]
+                    },
+                    "PHYS_OPT_DESIGN": {
+                        "IS_ENABLED": [
+                            "true"
+                        ],
+                        "ARGS": {
+                            "DIRECTIVE": [
+                                "Explore",
+                                "ExploreWithHoldFix",
+                                "ExploreWithAggressiveHoldFix",
+                                "AggressiveExplore",
+                                "AlternateReplication",
+                                "AggressiveFanoutOpt",
+                                "AddRetime",
+                                "AlternateFlowWithRetiming",
+                                "RuntimeOptimized"
+                            ]
+                        }
+                    },
+                    "ROUTE_DESIGN": {
+                        "ARGS": {
+                            "DIRECTIVE": [
+                                "Explore",
+                                "AggressiveExplore",
+                                "NoTimingRelaxation",
+                                "MoreGlobalIterations",
+                                "HigherDelayCost",
+                                "RuntimeOptimized",
+                                "AlternateCLBRouting",
+                                "Quick"
+                            ]
+                        }
+                    },
+                    "POST_ROUTE_PHYS_OPT_DESIGN": {
+                        "IS_ENABLED": [
+                            "true"
+                        ],
+                        "ARGS": {
+                            "DIRECTIVE": [
+                                "Explore",
+                                "ExploreWithHoldFix",
+                                "ExploreWithAggressiveHoldFix",
+                                "AggressiveExplore",
+                                "AlternateReplication",
+                                "AggressiveFanoutOpt",
+                                "AddRetime",
+                                "AlternateFlowWithRetiming",
+                                "RuntimeOptimized"
+                            ]
+                        }
+                    }
+                }
+            } 
+        }
+    } 
+}
diff --git a/hls4ml/templates/vitis_accelerator/accelerator_card.cfg b/hls4ml/templates/vitis_accelerator/accelerator_card.cfg
@@ -7,19 +7,8 @@ prop=kernel.kernel_wrapper.kernel_flags=-std=c++11
 
 [hls]
 pre_tcl=./hls_config.tcl
+# hls-fpga-machine-learning clock control
 
 # hls-fpga-machine-learning kernel control
 
-[vivado]
-prop=run.impl_1.STEPS.OPT_DESIGN.IS_ENABLED=true
-prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=Explore
-
-prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high
-
-prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=true
-prop=run.imp1_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=AggressiveExplore
-
-prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
-
-prop=run.impl_1.STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED=true
-prop=run.impl_1.STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE=AggressiveExplore
+# hls-fpga-machine-learning vivado directives
diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py
@@ -221,13 +221,18 @@ def write_accelerator_card_cfg(self, model):
                 raise Exception(format(self.vitis_accelerator_config.get_platform()) + 
                                 ' has only ' + format(num_channels) + ' memory banks.')
         
+        directives = self.vitis_accelerator_config.get_vivado_directives()
+        
         for line in f.readlines():
             if 'MYPLATFORM' in line:
                 newline = line.replace('MYPLATFORM', format(self.vitis_accelerator_config.get_platform()))
+            elif "# hls-fpga-machine-learning clock control" in line:
+                freq = round(1e9 / model.config.get_config_value('ClockPeriod'))
+                newline = 'clock={}:kernel_wrapper\n'.format(freq)
             elif '# hls-fpga-machine-learning kernel control' in line:
                 newline = '[connectivity]\n'
                 newline += 'nk=kernel_wrapper:' + format(num_kernels) + '\n\n'
-                if self.vitis_accelerator_config.get_board_type() == "alveo":
+                if self.vitis_accelerator_config.get_board_type() == 'alveo':
                     if memory_type == 'hbm':
                         for i in range(0, num_kernels):
                             newline += 'sp=kernel_wrapper_{}.in:HBM[{}:{}]\n'.format(i + 1, (i*2)*num_channels_per_cu, ((i*2 + 1)*num_channels_per_cu) - 1)
@@ -239,18 +244,27 @@ def write_accelerator_card_cfg(self, model):
                             newline += '\n'
                         for i in range(0, num_kernels):
                             newline += 'slr=kernel_wrapper_{}:SLR{}\n'.format(i + 1, i)
+            elif '# hls-fpga-machine-learning vivado directives' in line:
+                newline = ''
+                if directives:
+                    newline += '[vivado]\n'
+                    for x in directives:
+                        newline += x + '\n'
             else:
                 newline = line
             fout.write(newline)
         f.close()
         fout.close()
 
-        # Copy hls_config.tcl
-        filedir = os.path.dirname(os.path.abspath(__file__))
-        srcpath = os.path.join(filedir, '../templates/vitis_accelerator/hls_config.tcl')
-        dstpath = f'{model.config.get_output_dir()}/hls_config.tcl'
-        copy(srcpath, dstpath)
-
+        # Write hls_config.tcl
+        tcl_f = open(os.path.join(filedir, '../templates/vitis_accelerator/hls_config.tcl'))
+        tcl_fout = open(f'{model.config.get_output_dir()}/hls_config.tcl', 'w')
+        for line in tcl_f.readlines():
+            newline = line
+            tcl_fout.write(newline)
+        tcl_fout.write('\nset_clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%')))
+        tcl_f.close()
+        tcl_fout.close() 
 
     def write_nnet_utils_overrides(self, model):
         """Override nnet_types.h pointer comparison