Skip to content

[WIP] Updated land calibration pipeline #1210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions .buildkite/Manifest-v1.11.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

julia_version = "1.11.5"
manifest_format = "2.0"
project_hash = "c057205b5dfa45d173ab89f45dff9c1d2ab4a915"
project_hash = "d276b63cf59ce984bda717bcbae4d5f8e80fcd7a"

[[deps.ADTypes]]
git-tree-sha1 = "be7ae030256b8ef14a441726c4c37766b90b93a3"
Expand Down Expand Up @@ -416,7 +416,9 @@ weakdeps = ["SparseArrays"]

[[deps.ClimaAnalysis]]
deps = ["Artifacts", "Dates", "Interpolations", "NCDatasets", "NaNStatistics", "OrderedCollections", "Reexport", "Statistics", "Unitful"]
git-tree-sha1 = "79279dce43bac22423b5d7b83fdf8209bf00a331"
git-tree-sha1 = "a9c31362bc7c0ba771fc3bc8f66ff5dc99e18ce1"
repo-rev = "main"
repo-url = "https://github.com/CliMA/ClimaAnalysis.jl.git"
uuid = "29b5916a-a76c-4e73-9657-3c8fd22e65e6"
version = "0.5.18"
weakdeps = ["GeoMakie", "Makie"]
Expand All @@ -427,7 +429,9 @@ weakdeps = ["GeoMakie", "Makie"]

[[deps.ClimaCalibrate]]
deps = ["Dates", "Distributed", "Distributions", "EnsembleKalmanProcesses", "JLD2", "Logging", "Random", "TOML", "YAML"]
git-tree-sha1 = "2408ba55a30ac0f67550ef9d87cebfc372af3cca"
git-tree-sha1 = "fe9ddc24349035098116663fd0136e960d3cbb3c"
repo-rev = "kp/identity"
repo-url = "https://github.com/CliMA/ClimaCalibrate.jl.git"
uuid = "4347a170-ebd6-470c-89d3-5c705c0cacc2"
version = "0.1.2"

Expand Down Expand Up @@ -505,7 +509,9 @@ weakdeps = ["BenchmarkTools", "CUDA", "OrderedCollections", "PrettyTables", "Sta

[[deps.ClimaUtilities]]
deps = ["Artifacts", "ClimaComms", "Dates"]
git-tree-sha1 = "420fe76968208ac2eb2837e9e0f0974456ebec78"
git-tree-sha1 = "5a1a32069c2e6ba68a8e41e049701d64f4f9996e"
repo-rev = "main"
repo-url = "https://github.com/CliMA/ClimaUtilities.jl.git"
uuid = "b3f4f4ca-9299-4f7f-bd9b-81e1242a7513"
version = "0.1.24"

Expand Down Expand Up @@ -886,7 +892,9 @@ version = "2.2.4+0"

[[deps.EnsembleKalmanProcesses]]
deps = ["Convex", "Distributions", "DocStringExtensions", "FFMPEG", "GaussianRandomFields", "Interpolations", "LinearAlgebra", "MathOptInterface", "Optim", "QuadGK", "Random", "RecipesBase", "SCS", "SparseArrays", "Statistics", "StatsBase", "TOML", "TSVD"]
git-tree-sha1 = "938ba137333f7be93194cf5ffc4592b68b0efb36"
git-tree-sha1 = "76302edb2f49426a703d0decebf3446e18c958c1"
repo-rev = "main"
repo-url = "https://github.com/CliMA/EnsembleKalmanProcesses.jl.git"
uuid = "aa8a2aa5-91d8-4396-bcef-d4f2ec43552d"
version = "2.4.2"
weakdeps = ["Makie"]
Expand Down Expand Up @@ -1243,9 +1251,9 @@ weakdeps = ["Makie"]

[[deps.GeoMakie]]
deps = ["Colors", "CoordinateTransformations", "Downloads", "GeoFormatTypes", "GeoInterface", "GeoInterfaceMakie", "GeoJSON", "Geodesy", "GeometryBasics", "GeometryOps", "ImageIO", "LinearAlgebra", "Makie", "NaturalEarth", "Proj", "Reexport", "Statistics", "StructArrays"]
git-tree-sha1 = "8ef9756e37ca5bc892e920551f9aa683d5b8fab3"
git-tree-sha1 = "2db1d309af35a1ad440e75b9275e4b3b4715ed39"
uuid = "db073c08-6b98-4ee5-b6a4-5efafb3259c6"
version = "0.7.13"
version = "0.7.12"

[[deps.Geodesy]]
deps = ["CoordinateTransformations", "Dates", "LinearAlgebra", "StaticArrays"]
Expand Down Expand Up @@ -1554,10 +1562,10 @@ uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"

[[deps.JLD2]]
deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "PrecompileTools", "TranscodingStreams"]
git-tree-sha1 = "2f04f74d391f2d119980827ee80ebb1664847721"
deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "PrecompileTools", "ScopedValues", "TranscodingStreams"]
git-tree-sha1 = "d97791feefda45729613fafeccc4fbef3f539151"
uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
version = "0.5.14"
version = "0.5.15"
weakdeps = ["UnPack"]

[deps.JLD2.extensions]
Expand Down
1 change: 1 addition & 0 deletions .buildkite/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Insolation = "e98cc03f-d57e-4e3c-b70c-8d51efe9e0d8"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
20 changes: 20 additions & 0 deletions experiments/better_calibration/PBS_calibration.pbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# PBS batch script that instantiates the `.buildkite` Julia environment and
# then runs `experiments/better_calibration/calibrate_land.jl`.
#
# NOTE(review): both `--project` and the script path below are relative, so
# this presumably expects the job's working directory to be the repository
# root -- confirm how the job is submitted (e.g. whether a
# `cd "$PBS_O_WORKDIR"` is needed).

# Scheduler directives: job name, stdout/stderr files, wall-clock limit and
# requested resources.
#PBS -N derecho_calibration
#PBS -o output.txt
#PBS -e error.txt
#PBS -l walltime=12:00:00
#PBS -l select=1:ncpus=1:ngpus=1

## Account number for CliMA
#PBS -A UCIT0011
#PBS -q main

# Same account as the `-A` directive above, exported for the processes
# started by this script.
export PBS_ACCOUNT="UCIT0011"
# Prepend the CliMA module directory so that `module load climacommon` resolves.
export MODULEPATH="/glade/campaign/univ/ucit0011/ClimaModules-Derecho:$MODULEPATH"
module load climacommon

# ClimaComms settings read by the Julia processes below.
export CLIMACOMMS_DEVICE="CUDA"
export CLIMACOMMS_CONTEXT="SINGLETON"
# Install the packages recorded for the `.buildkite` environment before
# running the calibration.
julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true)'

# Run the calibration script in the same environment.
julia --project=.buildkite/ experiments/better_calibration/calibrate_land.jl
175 changes: 175 additions & 0 deletions experiments/better_calibration/generate_observations.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import ClimaAnalysis
import ClimaAnalysis: OutputVar
import ClimaLand
import ClimaCalibrate
import Dates
import ClimaCore
import EnsembleKalmanProcesses as EKP
import JLD2

include("getters.jl")
include("observation_utils.jl")

# TODO: Resolve this comment
# For now, we will use `data_sources.jl` for the leaderboard, since it is
# the easiest option, but it would be better to make your own `data_source.jl`
# and preprocess the observational data to match the simulation data as opposed
# to processing both the simulation and observational data (e.g. with ILAMB data).
include("../long_runs/leaderboard/data_sources.jl")

"""
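    make_era5_observation_vector(
        covar_estimator,
        short_names,
        sample_date_ranges,
        nelements,
    )

Make a vector of observations, one for each `(start_date, end_date)` pair in
`sample_date_ranges`, from the preprocessed ERA5 `OutputVar`s with the names in
`short_names`. `covar_estimator` is passed to
`ClimaCalibrate.ObservationRecipe.observation` for every observation.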
"""
function make_era5_observation_vector(
    covar_estimator,
    short_names,
    sample_date_ranges,
    nelements,
)
    # Build one observation per `(start, end)` pair in `sample_date_ranges`
    # from the preprocessed ERA5 variables named in `short_names`, and return
    # them in the same order as `sample_date_ranges`.
    #
    # Throws an `ArgumentError` if `sample_date_ranges` is empty: without at
    # least one date range there is no date to hand to `preprocess_era5_vars`
    # and no observation to build.
    isempty(sample_date_ranges) && throw(
        ArgumentError(
            "sample_date_ranges must contain at least one (start_date, end_date) pair",
        ),
    )

    # TODO: The start date argument can be removed by making data_sources.jl
    # not depend on a start date, but it is nice to be able to reuse the code
    # for the leaderboard and calibration

    # The start date doesn't matter since we never resample along the time
    # dimension, so we grab the first date in sample_date_ranges
    reference_date = first(first(sample_date_ranges))
    era5_vars = preprocess_era5_vars(short_names, reference_date, nelements)

    # The variables are preprocessed once and shared by every date range.
    # The do-block arguments are named differently from `reference_date` so
    # that nothing in the enclosing scope is shadowed.
    observation_vector = map(sample_date_ranges) do (range_start, range_end)
        ClimaCalibrate.ObservationRecipe.observation(
            covar_estimator,
            era5_vars,
            range_start,
            range_end,
        )
    end
    return observation_vector
end

"""
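    preprocess_era5_vars(short_names, start_date, nelements)

Preprocess the ERA5 variable for each name in `short_names` and return the
preprocessed variables in the same order as `short_names`.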
"""
function preprocess_era5_vars(short_names, start_date, nelements)
    era5_obs_vars = get_era5_obs_var_dict()

    # Validate every requested name before any variable is loaded, so an
    # unknown name fails before the preprocessing work starts.
    foreach(short_names) do short_name
        if !haskey(era5_obs_vars, short_name)
            error(
                "There is not variable with the short name $short_name. Add this variable to get_era5_obs_var_dict",
            )
        end
    end

    # Each dictionary entry is called with `start_date` to obtain the
    # variable, which is then preprocessed on its own.
    return map(short_names) do name
        load_var = era5_obs_vars[name]
        preprocess_single_era5_var(load_var(start_date), name, nelements)
    end
end

"""
    preprocess_single_era5_var(var::OutputVar, short_name, nelements)

Specify how each individual `OutputVar` from the ERA5 dataset should be
processed.

Note that the individual observation is not generated in this function.
"""
function preprocess_single_era5_var(var::OutputVar, short_name, nelements)
    # Only these short names have a known preprocessing recipe.
    if !(short_name in ("lhf", "shf", "swu", "lwu"))
        error(
            "Do not know how to process a variable with the short name $short_name",
        )
    end

    lats, lons = diagnostics_lat_lon(nelements)

    # Resampling is not `NaN` aware, so data containing `NaN`s is rejected
    # before any processing is attempted.
    if any(isnan, var.data)
        error(
            "Cannot process OutputVar with name $short_name because `NaN`s are in the data",
        )
    end

    # Restrict the time axis so that every season contains all three of its
    # months. The bounds were found by inspecting the ERA5 data and choosing
    # the earliest and latest dates that still give full seasons.
    full_seasons = ClimaAnalysis.window(
        var,
        "time";
        left = Dates.DateTime(1979, 3),
        right = Dates.DateTime(2024, 8),
        by = ClimaAnalysis.MatchValue(),
    )

    # Resampling is expensive, so the seasonal average is taken first to
    # shrink the data before it is resampled onto the diagnostics grid.
    seasonal =
        ClimaAnalysis.average_season_across_time(full_seasons, ignore_nan = true)
    resampled = ClimaAnalysis.resampled_as(seasonal, lon = lons, lat = lats)

    # ClimaLand.apply_oceanmask cannot be used here because of small
    # differences between the ClimaLand mask and
    # ClimaAnalysis.apply_ocean_mask, so for now the mask is built manually
    # from ClimaCore via `make_ocean_mask`.
    #
    # TODO: Alternatively, if the diagnostics are available, generate the mask
    # from the diagnostics (as long as there are no NaNs on the land in the
    # output).
    #
    # TODO: The best solution might be to write a land-sea mask in the
    # diagnostics (at the first time step only). The problem is that the
    # simulation then needs to run once.
    masked = make_ocean_mask(nelements)(resampled)

    # Keep every longitude index except the last one. The original rationale
    # is to prevent double counting (presumably of a repeated wrap-around
    # longitude -- TODO confirm).
    without_last_lon = ClimaAnalysis.window(
        masked,
        "longitude";
        right = length(lons) - 1,
        by = ClimaAnalysis.Index(),
    )

    # Throw away the poles: keep latitude indices 2 through length(lats) - 1.
    without_poles = ClimaAnalysis.window(
        without_last_lon,
        "latitude";
        left = 2,
        right = length(lats) - 1,
        by = ClimaAnalysis.Index(),
    )

    # Want the covariance matrix to be Float32, so convert the data here.
    return ClimaCalibrate.ObservationRecipe.change_data_type(
        without_poles,
        Float32,
    )
end

# Script entry point: runs only when this file is executed directly, not when
# it is `include`d from another file.
if abspath(PROGRAM_FILE) == @__FILE__
    # Alternative covariance estimator, kept for reference while the
    # calibration pipeline is being developed:
    # covar_estimator =
    #     ClimaCalibrate.ObservationRecipe.SeasonalDiagonalCovariance(;
    #         model_error_scale = Float32(0.05),
    #         regularization = Float32(25.0),
    #         ignore_nan = true,
    #         use_latitude_weights = true,
    #         min_cosd_lat = 0.1
    #     )
    covar_estimator = ClimaCalibrate.ObservationRecipe.ScalarCovariance(;
        scalar = 25.0,
        use_latitude_weights = true,
        min_cosd_lat = 0.1,
    )
    (; nelements, sample_date_ranges, short_names) = get_config()
    @info "The number of samples is $(length(sample_date_ranges))"

    observation_vector = make_era5_observation_vector(
        covar_estimator,
        short_names,
        sample_date_ranges,
        nelements,
    )

    # Anchor the output to this file's directory instead of the current
    # working directory. When the script is run from the repository root this
    # is the same file as before
    # (experiments/better_calibration/land_observation_vector.jld2), and the
    # script no longer depends on where it is launched from.
    output_path = joinpath(@__DIR__, "land_observation_vector.jld2")
    JLD2.save_object(output_path, observation_vector)
end
Loading