@@ -196,22 +196,30 @@ def _get_update_condition(schema: pyspark.sql.types.StructType) -> str | None:
196196        if  not  has_last_updated_field :
197197            return  None 
198198
199-         # OK, the field exists (which is typical for any FHIR resource tables, as we provide a wide FHIR schema),  
200-         # so we want to conditionally update rows based on the timestamp. 
199+         # OK, the field exists (which is typical for any FHIR resource tables, as we provide a wide 
200+         # FHIR schema),  so we want to conditionally update rows based on the timestamp. 
201201        # 
202-         # We skip the update row if both the table and the update have a lastUpdated value and the update's value 
203-         # is in the past. But err on the side of caution if anything is missing, by taking the update. 
202+         # We skip the update row if both the table and the update have a lastUpdated value and the 
203+         # update's value is in the past. But err on the side of caution if anything is missing, 
204+         # by taking the update. 
204205        # 
205-         # This uses less-than instead of less-than-or-equal just to avoid needless churn. 
206-         # If we eventually decide that sub-second updates are a real concern, we can make it <= and 
207-         # additionally compare versionId. But I don't know how you extracted both versions so quickly. :) 
206+         # This uses less-than-or-equal instead of less-than when comparing the date, because 
207+         # sometimes the ETL will upload different content for the same resource as we update the 
208+         # ETL (for example, we allow-list yet another extension - we still want to re-upload the 
209+         # content with the new extension but same lastUpdated value). This does cause some needless 
210+         # churn on the delta lake side, but we'll have to live with that. 
211+         # 
212+         # If we eventually decide that sub-second updates are a real concern, we can additionally 
213+         # compare versionId. But I don't know how you extracted both versions so quickly. :) 
208214        # 
209215        # The cast-as-timestamp does not seem to noticeably slow us down. 
210-         # If it becomes an issue, we could always actually convert this string column to a real date/time column. 
216+         # If it becomes an issue, we could always actually convert this string column to a real 
217+         # date/time column. 
211218        return  (
212219            "table.meta.lastUpdated is null or " 
213220            "updates.meta.lastUpdated is null or " 
214-             "CAST(table.meta.lastUpdated AS TIMESTAMP) < CAST(updates.meta.lastUpdated AS TIMESTAMP)" 
221+             "CAST(table.meta.lastUpdated AS TIMESTAMP) <= " 
222+             "CAST(updates.meta.lastUpdated AS TIMESTAMP)" 
215223        )
216224
217225    @staticmethod  
0 commit comments