
Add test cases #227


Draft · wants to merge 61 commits into base: develop
Changes from all commits · 61 commits
84ec619
WIP PDC historical data
sudan45 Apr 7, 2025
d5b1623
PDC historical extraction and chunking
sudan45 Apr 9, 2025
6c0fbea
Add extraction and transformation queue
sudan45 Apr 10, 2025
ae10752
Class based for gdacs.
Rup-Narayan-Rajbanshi Mar 27, 2025
9c36613
Merge pull request #243 from IFRCGo/feature/pdc-historical-data
Rup-Narayan-Rajbanshi Apr 10, 2025
49d932b
Merge pull request #244 from IFRCGo/feature/class-based-gdacs
sudan45 Apr 10, 2025
4d517ca
Use new geocoder
tnagorra Mar 28, 2025
bd7cb9d
amend! Use new geocoder
tnagorra Mar 28, 2025
3258287
amend! Use new geocoder
tnagorra Mar 28, 2025
6c2e4a9
amend! Use new geocoder
tnagorra Mar 28, 2025
8a99fe0
amend! amend! Use new geocoder
tnagorra Mar 28, 2025
633febc
Fix emdat after rebase.
Rup-Narayan-Rajbanshi Apr 11, 2025
c4fd024
Merge pull request #249 from IFRCGo/fix/historical-gdacs-pdc-geocoding
sudan45 Apr 11, 2025
bd61008
Chunking glide historical data
sudan45 Apr 2, 2025
521f0cb
Fix emdat historical data.
Rup-Narayan-Rajbanshi Apr 21, 2025
07b5d76
Fix emdat historical extraction.
Rup-Narayan-Rajbanshi Apr 22, 2025
ae78590
Merge pull request #251 from IFRCGo/fix/emdat-historical-extraction
sudan45 Apr 24, 2025
5a623f0
Rate limit for usgs
sudan45 Apr 11, 2025
5116568
fix(celery): Replace hardcoded queue names with variables
thenav56 Apr 12, 2025
62a7a9a
fix(usgs): Minor fixes
thenav56 Apr 13, 2025
f83847b
feat(logging): Show more info in logging
thenav56 Apr 13, 2025
75251aa
feat(model): Add ON_RETRY for ExtractionData Status
thenav56 Apr 13, 2025
c161f6a
feat(celery): Add RetryableTask
thenav56 Apr 13, 2025
1f411f9
refactor(usgs): Move extraction logic to handler
thenav56 Apr 13, 2025
fd5e0d3
feat(helm): Add queueDefaultResources
thenav56 Apr 13, 2025
95bf639
feat(extract): Add NoDataException support in BaseClass
thenav56 Apr 14, 2025
ed85e45
Handle end date and remove NoDataException
sudan45 Apr 17, 2025
37f6a18
Fix issue for losses if not exists in pdc.
Rup-Narayan-Rajbanshi Apr 25, 2025
9150cbd
Merge pull request #250 from IFRCGo/feature/rate-limit-for-usgs
Rup-Narayan-Rajbanshi Apr 25, 2025
b684add
Update ifrc base extraction
sudan45 Apr 28, 2025
2b58860
Update ibtracs base extraction
sudan45 Apr 29, 2025
b299390
Update pagination limit
sudan45 Apr 29, 2025
82f7bca
Merge pull request #257 from IFRCGo/feature/ibtracs-base-extraction
Rup-Narayan-Rajbanshi Apr 30, 2025
158487a
Refactor the IDU source extraction based on version2
ranjan-stha Apr 29, 2025
f3d6367
Set add queue to False
ranjan-stha Apr 30, 2025
1d52a3b
Merge pull request #256 from IFRCGo/feature/ifrc-dref-base-extraction
Rup-Narayan-Rajbanshi Apr 30, 2025
bce9523
Refactor the GIDD source extraction based on version2
ranjan-stha Apr 29, 2025
6a1a372
Set add queue to False
ranjan-stha Apr 30, 2025
60b7fdd
Merge pull request #259 from IFRCGo/refactor-gidd-extraction-v2
Rup-Narayan-Rajbanshi Apr 30, 2025
d76aedd
Merge pull request #258 from IFRCGo/refactor-idu-extraction-v2
Rup-Narayan-Rajbanshi Apr 30, 2025
bc95b76
Refactor glide extraction, transform to get triggered by extraction_id.
Rup-Narayan-Rajbanshi Apr 28, 2025
600bc71
Refactor emdat extraction to get triggered by extraction id.
Rup-Narayan-Rajbanshi Apr 28, 2025
b824273
Refactor glide and emdat extraction.
Rup-Narayan-Rajbanshi Apr 29, 2025
01fd56f
Update PDC extractions
sudan45 Apr 27, 2025
047b38a
Update pdc extraction
sudan45 Apr 28, 2025
b83eddd
Add pydantic object in metadata
sudan45 Apr 30, 2025
2d72e01
Refactor extraction and transform method
sudan45 Apr 30, 2025
031f8d7
Update desinventar extraction
sudan45 Apr 30, 2025
2b55046
Update desinventar extraction and pydantic object
sudan45 Apr 30, 2025
7f270a7
Refactor the GFD source extraction based on version2
ranjan-stha Apr 30, 2025
ff4317e
Handle the rate limit exception
ranjan-stha May 2, 2025
4bb6ab3
Fix the types
ranjan-stha May 2, 2025
5f0a3de
Update the specific field
ranjan-stha May 5, 2025
5133a78
Merge pull request #262 from IFRCGo/refactor-gfd-extraction-v2
Rup-Narayan-Rajbanshi May 5, 2025
ccea71f
Refactor gdacs extractionV2.
Rup-Narayan-Rajbanshi Apr 30, 2025
9822b47
Refactor cron jobs.
Rup-Narayan-Rajbanshi May 6, 2025
7c5a1d8
Merge pull request #263 from IFRCGo/fix/refactor-corn-job
sudan45 May 6, 2025
cec3fc8
Add additional package to build for arm64
Shhhhhubh Mar 21, 2025
367e56a
Setup for running tests
Shhhhhubh Mar 21, 2025
7b9279a
Basic setup to test IDU
Shhhhhubh Mar 21, 2025
bf8f878
Test case for all sources
Shhhhhubh Apr 16, 2025
5 changes: 5 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
@@ -32,6 +33,7 @@ MANIFEST
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
*.sqlite

# Installer logs
pip-log.txt
@@ -132,4 +134,7 @@ dmypy.json

# Media/Data
assets/
media/
*.gpkg

apps/etl/Dataset
4 changes: 2 additions & 2 deletions Dockerfile
@@ -20,14 +20,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
apt-get update -y \
&& apt-get install -y --no-install-recommends \
# Build required packages
gcc libc-dev gdal-bin libproj-dev \
build-essential gcc libc-dev gdal-bin libgdal-dev libproj-dev \
# Helper packages
procps \
wait-for-it \
# FIXME: Add condition to skip dev dependencies
&& uv sync --frozen --no-install-project --all-groups \
# Clean-up
&& apt-get remove -y gcc libc-dev libproj-dev \
&& apt-get remove -y gcc libc-dev libproj-dev build-essential libgdal-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*

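The Dockerfile change adds build-essential and libgdal-dev to the build stage so that native extensions (notably the GDAL Python bindings) can compile against the system libgdal, then removes them again in the clean-up step to keep the final image small. A quick post-build sanity check might look like the following sketch; it assumes the gdal Python package is among the project's dependencies, which this diff does not show:

```python
# Hypothetical post-build sanity check: confirm the GDAL Python bindings
# import cleanly and report the libgdal they were compiled against.
# Assumes the `gdal` package is in the dependency set (not shown here).
from osgeo import gdal

print("GDAL bindings:", gdal.__version__)
print("libgdal runtime:", gdal.VersionInfo("RELEASE_NAME"))
```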
49 changes: 49 additions & 0 deletions Dockerfile.save
@@ -0,0 +1,49 @@
<<<<<<< HEAD
FROM python:3.12-slim-bookworm AS base
COPY --from=ghcr.io/astral-sh/uv:0.5.29 /uv /uvx /bin/
||||||| parent of 074f51d (Upgrade to bookworm)
FROM python:3.12-slim-bullseye AS base
COPY --from=ghcr.io/astral-sh/uv:0.5.29 /uv /uvx /bin/
=======
FROM python:3.13-slim-bookworm AS base
COPY --from=ghcr.io/astral-sh/uv:0.6.8 /uv /uvx /bin/
>>>>>>> 074f51d (Upgrade to bookworm)

LABEL maintainer="Montandon Dev"
LABEL org.opencontainers.image.source="https://github.com/IFRCGo/montandon-etl/"

ENV PYTHONUNBUFFERED=1

ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy
ENV UV_PROJECT_ENVIRONMENT="/usr/local/"

WORKDIR /code

COPY libs /code/libs

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
apt-get update -y \
&& apt-get install -y --no-install-recommends \
# Build required packages
build-essential gcc libc-dev gdal-bin libgdal-dev libproj-dev \
# Helper packages
procps \
wait-for-it \
<<<<<<< HEAD
&& uv sync --frozen --no-install-project --all-groups \
||||||| parent of 074f51d (Upgrade to bookworm)
&& uv sync --frozen --no-install-project --no-dev \
=======
&& uv lock --locked --offline \
# FIXME: Add condition to skip dev dependencies
&& uv sync --frozen --no-install-project --all-groups \
>>>>>>> 074f51d (Upgrade to bookworm)
# Clean-up
&& apt-get remove -y gcc libc-dev libproj-dev build-essential libgdal-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*

COPY . /code/
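Dockerfile.save is committed with unresolved diff3-style conflict markers (<<<<<<<, |||||||, =======, >>>>>>>) still in place, which suggests a leftover merge backup rather than an intentional build file. Purely as an illustration (none of this is in the PR), a small scanner that flags such files before they land:

```python
# Illustrative pre-merge check (not part of this PR): flag files that still
# contain Git conflict markers like the ones preserved in Dockerfile.save.
import re
import sys
from pathlib import Path

# diff3-style markers: ours (<<<<<<<), base (|||||||), separator (=======),
# theirs (>>>>>>>), each at the start of a line.
MARKER = re.compile(r"^(<{7}|\|{7}|={7}|>{7})( |$)", re.MULTILINE)


def find_conflicts(root: str = ".") -> list[Path]:
    hits = []
    for path in Path(root).rglob("*"):
        if path.is_file() and ".git" not in path.parts:
            try:
                text = path.read_text(errors="ignore")
            except OSError:
                continue
            if MARKER.search(text):
                hits.append(path)
    return hits


if __name__ == "__main__":
    conflicted = find_conflicts()
    for p in conflicted:
        print(f"conflict markers found: {p}")
    sys.exit(1 if conflicted else 0)
```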
8 changes: 7 additions & 1 deletion apps/etl/admin.py
@@ -108,7 +108,7 @@ class TransformAdmin(EtlResourceAdminMixin, AdminReadOnlyMixin, DjangoQLSearchMi
"success_percentage",
"total_rows",
)
list_filter = ("status",)
list_filter = ("status", "extraction_id__source")
autocomplete_fields = ["extraction"]
search_fields = ["extraction"]

@@ -140,10 +140,16 @@ class PyStacLoadDataAdmin(AdminReadOnlyMixin, DjangoQLSearchMixin, admin.ModelAd
list_filter = (
"item_type",
"status",
"transform_id__extraction_id__source",
)
list_filter = ("item_type", "status", "transform_id__extraction_id__source")
autocomplete_fields = ["transform_id"]
search_fields = ["transform_id"]

@admin.display(description="Source", ordering="source")
def source(self, instance):
return ExtractionData.Source(instance.source).label

def get_queryset(self, request):
# NOTE: item contains heavy json data
return super().get_queryset(request).defer("item")
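Two details in this hunk are worth calling out: the list_filter entries traverse relations (transform_id__extraction_id__source) so loads can be filtered by their originating source, and get_queryset() defers the heavy item JSON column so the admin list page never fetches it. A minimal sketch of the defer pattern, with illustrative model and field names rather than the project's actual definitions:

```python
# Minimal sketch of the defer() pattern used above: keep a heavy column
# out of admin list queries. Names here are illustrative, not the
# project's actual models.
from django.contrib import admin
from django.db import models


class LoadRecord(models.Model):
    status = models.CharField(max_length=32)
    item = models.JSONField()  # large payload; expensive to fetch per row


class LoadRecordAdmin(admin.ModelAdmin):
    list_display = ("id", "status")

    def get_queryset(self, request):
        # defer() issues the SELECT without `item`; Django lazily fetches
        # it only if something touches the field on a specific instance.
        return super().get_queryset(request).defer("item")
```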
45 changes: 24 additions & 21 deletions apps/etl/etl_tasks/desinventar.py
@@ -1,9 +1,14 @@
import logging

from celery import chain, shared_task
from celery import shared_task

from apps.etl.extraction.sources.desinventar.extract import DesinventarExtraction, DesInventarExtractionInputMetadata
from apps.etl.transform.sources.desinventar import DesinventarTransformHandler
from apps.etl.extraction.sources.desinventar.extract import (
DesInventarExtraction,
DesInventarExtractionMetadata,
DesInventarExtractionParamsMetadata,
DesInventarMetadataType,
)
from main.configs import etl_config

logger = logging.getLogger(__name__)

@@ -119,22 +124,20 @@
@shared_task
def ext_and_transform_desinventar_historical_data():
for country_code in country_code_iso3_list:
metadata = DesInventarExtractionInputMetadata(
country_code=country_code,
iso3=country_code,
).model_dump()
chain(
DesinventarExtraction.task.s(metadata),
DesinventarTransformHandler.task.s(),
).apply_async()

# FIXME: country_code should be region_code
url = f"{etl_config.DESINVENTAR_DATA_URL}/DesInventar/download/DI_export_{country_code}.zip"
DesInventarExtraction.init_extraction(
metadata=DesInventarExtractionMetadata(
url=url,
type=DesInventarMetadataType.QUERY,
params=DesInventarExtractionParamsMetadata(country_code=country_code, iso3=country_code),
)
)
for country_code, iso3 in additional_region_code_to_iso3_map.items():
metadata = DesInventarExtractionInputMetadata(
country_code=country_code,
iso3=iso3,
).model_dump()
chain(
DesinventarExtraction.task.s(metadata),
DesinventarTransformHandler.task.s(),
).apply_async()
url = f"{etl_config.DESINVENTAR_DATA_URL}/DesInventar/download/DI_export_{country_code}.zip"
DesInventarExtraction.init_extraction(
metadata=DesInventarExtractionMetadata(
url=url,
type=DesInventarMetadataType.QUERY,
params=DesInventarExtractionParamsMetadata(country_code=country_code, iso3=country_code),
)
)
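Across these task modules the refactor drops the explicit chain(extract, transform) calls in favour of init_extraction(metadata=...), which, judging by the add_to_queue=False argument in the emdat hunk below, persists an extraction record and optionally enqueues it by id. A hypothetical sketch of that contract; the real base class is not part of this diff, and ExtractionData is the project's model while cls.source and cls.task are assumed class attributes:

```python
# Hypothetical sketch of the init_extraction() contract that callers now
# rely on; not the actual implementation.


class BaseExtractionSketch:
    @classmethod
    def init_extraction(cls, metadata, add_to_queue: bool = True):
        # Persist the run first, so every extraction is traceable and
        # retryable by id.
        extraction = ExtractionData.objects.create(  # noqa: F821 (sketch)
            source=cls.source,
            metadata=metadata.model_dump(),
        )
        if add_to_queue:
            # Default path: enqueue the extraction task by id. Callers that
            # build their own chain (extract -> transform) pass
            # add_to_queue=False, as the emdat tasks below do.
            cls.task.delay(extraction.id)
        return extraction
```

On that reading, transforms are no longer handed the extracted payload directly; they are triggered by extraction id, matching the "transform to get triggered by extraction_id" commits above.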
72 changes: 40 additions & 32 deletions apps/etl/etl_tasks/emdat.py
@@ -2,8 +2,14 @@

from celery import chain, shared_task

from apps.etl.extraction.sources.emdat.extract import EMDATExtraction, EmdatExtractionInputMetadata
from apps.etl.extraction.sources.emdat.extract import (
EmdatExtraction,
EmdatExtractionMetadata,
EmdatExtractionMetadataType,
EmdatExtractionParamsMetadata,
)
from apps.etl.transform.sources.emdat import EMDATTransformHandler
from apps.etl.utils import get_cluster_codes
from main.configs import etl_config

QUERY = """
@@ -13,11 +19,12 @@
$include_hist: Boolean
$from: Int
$to: Int
$classif: [String!]
) {
api_version
public_emdat(
cursor: { offset: $offset, limit: $limit }
filters: { include_hist: $include_hist, from: $from, to: $to }
filters: { include_hist: $include_hist, from: $from, to: $to, classif: $classif }
) {
total_available
info {
@@ -78,39 +85,40 @@
"""


# FIXME: Remove kwargs?
@shared_task
def ext_and_transform_emdat_latest_data(**kwargs):
# FIXME: Why are we getting data from etl_config.EMDAT_START_YEAR to get the latest data?
# Also, the filtering only filters using year so we might have lot of duplicate data
variables = EmdatExtractionInputMetadata.model_validate(
{
"limit": -1,
"from": etl_config.EMDAT_START_YEAR,
"to": datetime.now().year,
"include_hist": None,
}
).model_dump(by_alias=True)

chain(
EMDATExtraction.task.s(QUERY, variables),
EMDATTransformHandler.task.s(),
).apply_async()
extraction_object = EmdatExtraction.init_extraction(
metadata=EmdatExtractionMetadata(
params=EmdatExtractionParamsMetadata(
limit=-1,
from_=etl_config.EMDAT_START_YEAR,
to=datetime.now().year,
include_hist=None,
classif=get_cluster_codes(),
),
url=f"{etl_config.EMDAT_URL}/v1",
type=EmdatExtractionMetadataType.QUERY,
),
add_to_queue=False,
)
chain(EmdatExtraction.task.s(extraction_object.id), EMDATTransformHandler.task.s()).apply_async()


# FIXME: Remove kwargs?
@shared_task
def ext_and_transform_emdat_historical_data(**kwargs):
variables = EmdatExtractionInputMetadata.model_validate(
{
"limit": -1,
"from": None,
"to": None,
"include_hist": True,
}
).model_dump(by_alias=True)

chain(
EMDATExtraction.task.s(QUERY, variables),
EMDATTransformHandler.task.s(),
).apply_async()
for i in range(etl_config.EMDAT_START_YEAR, etl_config.EMDAT_END_YEAR + 1):
extraction_object = EmdatExtraction.init_extraction(
metadata=EmdatExtractionMetadata(
params=EmdatExtractionParamsMetadata(
limit=-1,
from_=i,
to=i,
include_hist=True,
classif=get_cluster_codes(),
),
url=f"{etl_config.EMDAT_URL}/v1",
type=EmdatExtractionMetadataType.QUERY,
),
add_to_queue=False,
)
chain(EmdatExtraction.task.s(extraction_object.id), EMDATTransformHandler.task.s()).apply_async()
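The removed code built the GraphQL variables with model_dump(by_alias=True), and the new EmdatExtractionParamsMetadata is constructed with from_ as a keyword, which suggests the field is aliased to the GraphQL variable name from (a Python keyword, so it cannot be a plain field name). A minimal Pydantic sketch of that aliasing, with illustrative field names; the real model lives in apps/etl/extraction/sources/emdat/extract.py and is not shown in this diff:

```python
# Minimal sketch of the from_ -> "from" aliasing the params metadata
# appears to use. Field names are illustrative, not the project's model.
from pydantic import BaseModel, Field


class EmdatParamsSketch(BaseModel):
    limit: int
    from_: int | None = Field(default=None, alias="from")
    to: int | None = None
    include_hist: bool | None = None
    classif: list[str] | None = None

    # Allow construction by field name (from_=...) as well as by alias.
    model_config = {"populate_by_name": True}


params = EmdatParamsSketch(limit=-1, from_=2000, to=2000, include_hist=True)
print(params.model_dump(by_alias=True))
# {'limit': -1, 'from': 2000, 'to': 2000, 'include_hist': True, 'classif': None}
```

Note also that the historical task now fans out one bounded extraction per year from EMDAT_START_YEAR through EMDAT_END_YEAR instead of issuing a single all-years query.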