Fix #188 - create a tool to collect non-alias and dump all metadata (#255)

microdataxyz · web-flow · commit 93c5bb520d13 · 2023-04-11T11:34:33.000+08:00
diff --git a/docs/scan_ids.md b/docs/scan_ids.md
@@ -0,0 +1,49 @@
+# Scan IDs tool
+
+This tool dumps the metadata of all non-alias IDs into a JSON file.
+
+Run it with the following command:
+```commandline
+python -m tools.scan_ids
+```
+
+The result looks like:
+```json
+[
+  {
+    "package_name": "idnumbers.nationalid.yugoslavia",
+    "country_code": "yugoslavia",
+    "ids": [
+      {
+        "class_name": "UniqueMasterCitizenNumber",
+        "metadata": {
+          "iso3166_alpha2": null,
+          "min_length": 13,
+          "max_length": 13,
+          "parsable": true,
+          "checksum": true,
+          "regexp": "^(?P<dd>\\d{2})(?P<mm>\\d{2})(?P<yyy>\\d{3})(?P<location>\\d{2})(?P<sn>\\d{3})(?P<checksum>\\d)$",
+          "alias_of": null,
+          "names": [
+            "Unique  master citizen number",
+            "JMBG",
+            "Jedinstveni mati\u010dni broj gra\u0111ana",
+            "\u0408\u0435\u0434\u0438\u043d\u0441\u0442\u0432\u0435\u043d\u0438 \u043c\u0430\u0442\u0438\u0447\u043d\u0438 \u0431\u0440\u043e\u0458 \u0433\u0440\u0430\u0452\u0430\u043d\u0430",
+            "\u0408\u041c\u0411\u0413",
+            "\u0415\u0434\u0438\u043d\u0441\u0442\u0432\u0435\u043d \u043c\u0430\u0442\u0438\u0447\u0435\u043d \u0431\u0440\u043e\u0458 \u043d\u0430 \u0433\u0440\u0430\u0453\u0430\u043d\u0438\u043d\u043e\u0442",
+            "\u0415\u041c\u0411\u0413",
+            "Enotna mati\u010dna \u0161tevilka ob\u010dana,",
+            "EM\u0160O"
+          ],
+          "links": [
+            "https://en.wikipedia.org/wiki/Unique_Master_Citizen_Number"
+          ],
+          "deprecated": false
+        }
+      }
+    ]
+  }
+]
+```
+
+We can use this JSON to build reference documentation or to generate sample code.
diff --git a/idnumbers/nationalid/smr/social_security.py b/idnumbers/nationalid/smr/social_security.py
@@ -36,31 +36,5 @@ def validate(id_number: str) -> bool:
         return validate_regexp(id_number, SocialSecurityNumber.METADATA.regexp)
 
 
-class TaxRegistrationNumber:
-    """
-    San Marino, entity tax registration number, COE number
-    https://www.oecd.org/tax/automatic-exchange/crs-implementation-and-assistance/tax-identification-numbers/San-Marino-TIN.pdf
-    """
-    METADATA = SimpleNamespace(**{
-        'iso3166_alpha2': 'SM',
-        'min_length': 7,
-        'max_length': 7,
-        # length without insignificant chars
-        'parsable': False,
-        # has parse function
-        'checksum': False,
-        # has checksum function
-        'regexp': re.compile(r'^SM\d{5}$')
-        # regular expression to validate the id
-    })
-
-    @staticmethod
-    def validate(id_number: str) -> bool:
-        """
-        Validate
-        """
-        return validate_regexp(id_number, TaxRegistrationNumber.METADATA.regexp)
-
-
 SSI = SocialSecurityNumber
 """alias of SocialSecurityNumber"""
diff --git a/tools/__init__.py b/tools/__init__.py
diff --git a/tools/scan_ids.py b/tools/scan_ids.py
@@ -0,0 +1,63 @@
+import importlib
+import inspect
+import json
+import os
+
+
+def collect_ids(package_name, output_filename):
+    """Write the ``METADATA`` of every non-alias ID class in a package to JSON.
+
+    Also prints how many modules, countries and IDs were collected.
+
+    :param package_name: dotted name of the root package to scan
+    :param output_filename: path of the JSON file to write
+    """
+    package_directory = importlib.import_module(package_name).__path__[0]
+    metadata = []
+    classes_count = 0
+    country_codes = set()
+    for root, _, files in os.walk(package_directory):
+        for file in files:
+            if not file.endswith('.py'):
+                continue
+            # Convert the file path to a dotted path relative to the package
+            rel_path = os.path.relpath(os.path.join(root, file), package_directory)
+            module_name = os.path.splitext(rel_path)[0].replace(os.path.sep, '.')
+            # No upper case module name. They are aliases
+            if module_name == module_name.upper():
+                continue
+            module = importlib.import_module(package_name + '.' + module_name)
+            module_metadata = []
+            for name, obj in inspect.getmembers(module, inspect.isclass):
+                meta = getattr(obj, 'METADATA', None)
+                # A METADATA without ``alias_of`` is treated as a non-alias
+                if meta is None or getattr(meta, 'alias_of', None) is not None:
+                    continue
+                # Copy, so the class keeps its compiled regexp after the scan
+                cls_metadata = dict(vars(meta))
+                regexp = cls_metadata.get('regexp')
+                if regexp is not None and not isinstance(regexp, str):
+                    cls_metadata['regexp'] = regexp.pattern
+                module_metadata.append({'class_name': name, 'metadata': cls_metadata})
+
+            if module_metadata:
+                classes_count += len(module_metadata)
+                country_code = module_name.split('.')[0]
+                country_codes.add(country_code)
+                metadata.append({
+                    'package_name': package_name + '.' + module_name,
+                    'country_code': country_code,
+                    'ids': module_metadata
+                })
+
+    print('----------------------------------------------------------------------------')
+    print(f'Modules: {len(metadata)}')
+    print(f'Countries: {len(country_codes)}')
+    print(f'IDs: {classes_count}')
+    print('----------------------------------------------------------------------------')
+    with open(output_filename, 'w', encoding='utf-8') as f:
+        json.dump(metadata, f, indent=2)
+
+
+if __name__ == '__main__':  # entry point for ``python -m tools.scan_ids``
+    collect_ids('idnumbers.nationalid', 'result.json')