|
38 | 38 | from django.template.defaultfilters import pluralize
|
39 | 39 |
|
40 | 40 | from commoncode.paths import common_prefix
|
| 41 | +from elf_inspector.dwarf import get_dwarf_paths |
41 | 42 | from extractcode import EXTRACT_SUFFIX
|
| 43 | +from go_inspector.plugin import collect_and_parse_symbols |
42 | 44 | from packagedcode.npm import NpmPackageJsonHandler
|
43 | 45 | from summarycode.classify import LEGAL_STARTS_ENDS
|
44 | 46 |
|
@@ -1662,3 +1664,224 @@ def _match_purldb_resources_post_process(
|
1662 | 1664 | package.add_resources(unmapped_resources)
|
1663 | 1665 |
|
1664 | 1666 | return interesting_codebase_resources.count()
|
| 1667 | + |
| 1668 | + |
def map_paths_resource(
    to_resource, from_resources, from_resources_index, map_types, logger=None
):
    """
    Map paths found in the ``to_resource`` extra_data to paths of the ``from_resources``
    CodebaseResource queryset using the precomputed ``from_resources_index`` path index.

    For each map type of ``map_types``, the paths stored under that key in
    ``to_resource.extra_data`` are processed with ``process_paths_in_binary``.
    Paths that are not mapped are collected under the ``"<map_type>_not_mapped"``
    key of ``to_resource.extra_data`` and the resource is saved only when that
    list is not empty. Relations are deduplicated on the key yielded by
    ``process_paths_in_binary`` and created in a single bulk operation.

    ``logger`` is an optional callable accepting a message string. Nothing is
    logged when it is None.
    """
    # Accumulate unique relation objects for bulk creation
    relations_to_create = {}

    for map_type in map_types:
        # These are of type string
        paths_in_binary = to_resource.extra_data.get(map_type, [])
        paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
        for item in process_paths_in_binary(
            to_resource=to_resource,
            from_resources=from_resources,
            from_resources_index=from_resources_index,
            map_type=map_type,
            paths_in_binary=paths_in_binary,
        ):
            if isinstance(item, str):
                # A bare string is a path that could not be mapped.
                paths_not_mapped.append(item)
            else:
                rel_key, relation = item
                # Keep only the first relation seen for a given key.
                relations_to_create.setdefault(rel_key, relation)

        if paths_not_mapped:
            to_resource.save()
            # ``logger`` defaults to None: guard the call to avoid a TypeError.
            if logger:
                logger(
                    f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT mapped "
                    f"for: {to_resource.path!r}"
                )

    if relations_to_create:
        rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
        if logger:
            logger(
                f"Created {len(rels)} mappings using "
                f"{', '.join(map_types)} for: {to_resource.path!r}"
            )
    elif logger:
        logger(f"No mappings using {', '.join(map_types)} for: {to_resource.path!r}")
| 1713 | + |
| 1714 | + |
def process_paths_in_binary(
    to_resource,
    from_resources,
    from_resources_index,
    map_type,
    paths_in_binary,
):
    """
    Iterate over ``paths_in_binary`` and yield, for each path, either:
    - a tuple of (unique relation key, unsaved ``CodebaseRelation`` object)
      when the path is matched in ``from_resources_index``
    - or the path string itself when there is no match, or when the match is
      rejected by ``is_invalid_match``
    """
    for binary_path in paths_in_binary:
        found = pathmap.find_paths(binary_path, from_resources_index)
        # No match at all, or a match considered invalid: report the path as-is.
        if not found or is_invalid_match(found, found.matched_path_length):
            yield binary_path
            continue

        candidates = sort_matched_from_resources(
            [from_resources.get(id=resource_id) for resource_id in found.resource_ids]
        )
        # The first resource in sort order wins.
        source = candidates[0]

        total_segments = count_path_segments(binary_path) - 1
        relation_key = (source.path, to_resource.path, map_type)
        relation = CodebaseRelation(
            project=source.project,
            from_resource=source,
            to_resource=to_resource,
            map_type=map_type,
            extra_data={
                "path_score": f"{found.matched_path_length}/{total_segments}",
                map_type: binary_path,
            },
        )
        yield relation_key, relation
| 1759 | + |
| 1760 | + |
def count_path_segments(path):
    """
    Return the number of "/"-separated segments in the POSIX ``path`` string,
    ignoring leading and trailing slashes.
    """
    trimmed = path.strip("/")
    # One more segment than there are separators.
    return trimmed.count("/") + 1
| 1764 | + |
| 1765 | + |
def sort_matched_from_resources(matched_from_resources):
    """
    Return a new list of the ``matched_from_resources`` ordered by ascending
    number of path segments, then by path.
    """
    ordered = list(matched_from_resources)
    ordered.sort(
        key=lambda resource: (count_path_segments(resource.path), resource.path)
    )
    return ordered
| 1776 | + |
| 1777 | + |
def is_invalid_match(match, matched_path_length):
    """
    Return True when ``matched_path_length`` is 1 and ``match.resource_ids``
    does not hold exactly one resource ID. Return False otherwise.
    """
    if matched_path_length != 1:
        return False
    return len(match.resource_ids) != 1
| 1784 | + |
| 1785 | + |
def map_elfs(project, logger=None):
    """
    Map ELF binaries to their sources in ``project``.

    The DWARF paths of each to/ ELF file without relation are first collected
    and stored in the resource extra data, then each of these resources is
    mapped to the from/ resources with ``map_paths_resource`` using the
    ``dwarf_compiled_paths`` and ``dwarf_included_paths`` map types.

    ``logger`` is an optional callable accepting a message string. Nothing is
    logged when it is None.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files().to_codebase().has_no_relation().elfs()
    )
    for resource in to_resources:
        try:
            paths = get_elf_file_dwarf_paths(resource.location_path)
            resource.update_extra_data(paths)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is skipped.
            # ``logger`` defaults to None: guard the call so that a parsing
            # failure is not turned into a TypeError.
            if logger:
                logger(f"Can not parse {resource.location_path!r} {e!r}")

    if logger:
        logger(
            f"Mapping {to_resources.count():,d} to/ resources using paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    from_resources_index = pathmap.build_index(
        from_resources.values_list("id", "path"), with_subpaths=True
    )

    if logger:
        logger("Done building from/ resources index.")

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources.count(), logger)
    for to_resource in progress.iter(resource_iterator):
        map_paths_resource(
            to_resource,
            from_resources,
            from_resources_index,
            map_types=["dwarf_compiled_paths", "dwarf_included_paths"],
            logger=logger,
        )
| 1822 | + |
| 1823 | + |
def get_elf_file_dwarf_paths(location):
    """
    Return a mapping of the DWARF paths reported by ``get_dwarf_paths`` for the
    file at ``location``, with ``dwarf_compiled_paths`` and
    ``dwarf_included_paths`` keys. A key is present only when its list of paths
    is not empty.
    """
    collected = get_dwarf_paths(location)
    key_mapping = (
        ("compiled_paths", "dwarf_compiled_paths"),
        ("included_paths", "dwarf_included_paths"),
    )
    dwarf_paths = {}
    for source_key, target_key in key_mapping:
        values = collected.get(source_key)
        # Skip missing and empty entries alike.
        if values:
            dwarf_paths[target_key] = values
    return dwarf_paths
| 1835 | + |
| 1836 | + |
def get_go_file_paths(location):
    """
    Return a mapping with a ``go_file_paths`` key holding the file paths found
    in the Go symbols collected from the file at ``location``. The mapping is
    empty when no file path is found.
    """
    parsed = collect_and_parse_symbols(location, check_type=False)
    symbols = parsed.get("go_symbols") or {}
    found_paths = symbols.get("file_paths") or []
    return {"go_file_paths": found_paths} if found_paths else {}
| 1847 | + |
| 1848 | + |
def map_go_paths(project, logger=None):
    """
    Map Go binaries to their source in ``project``.

    The Go file paths of each to/ executable binary without relation are first
    collected and stored in the resource extra data, then each of these
    resources is mapped to the from/ resources with ``map_paths_resource``
    using the ``go_file_paths`` map type.

    ``logger`` is an optional callable accepting a message string. Nothing is
    logged when it is None.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files()
        .to_codebase()
        .has_no_relation()
        .executable_binaries()
    )
    for resource in to_resources:
        try:
            paths = get_go_file_paths(resource.location_path)
            resource.update_extra_data(paths)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is skipped.
            # ``logger`` defaults to None: guard the call so that a parsing
            # failure is not turned into a TypeError.
            if logger:
                logger(f"Can not parse {resource.location_path!r} {e!r}")

    if logger:
        logger(
            f"Mapping {to_resources.count():,d} to/ resources using paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    from_resources_index = pathmap.build_index(
        from_resources.values_list("id", "path"), with_subpaths=True
    )

    if logger:
        logger("Done building from/ resources index.")

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources.count(), logger)
    for to_resource in progress.iter(resource_iterator):
        map_paths_resource(
            to_resource,
            from_resources,
            from_resources_index,
            map_types=["go_file_paths"],
            logger=logger,
        )
0 commit comments