Synthetic Tests

[1]:
import numpy as np
import pandas as pd
import requests
import io
from scipy import interpolate
import matplotlib.pyplot as plt
from cartopy import crs as ccrs, feature as cfeature
import json
import ast

from gen_synth_data import gen_data, distort_data

from sedprep.kalman_filter import Filter
from sedprep.utils import lif
from sedprep.data_handling import add_identifier, read_arch_data, clean_data
from sedprep.constants import field_params
from sedprep.plot_functions import plot_DIF

folder = "../dat"
cpu
[2]:
np.random.seed(1000)
new_sample_from_prior = False    # set to True to generate a new reference model
# We use the temporal and spatial distribution of the real archaeo data and of the real sediment record (P3.csv)
pre = "https://nextcloud.gfz-potsdam.de/s/"
rejection_response = requests.get(f"{pre}WLxDTddq663zFLP/download")
rejection_response.raise_for_status()
with np.load(io.BytesIO(rejection_response.content), allow_pickle=True) as fh:
    to_reject = fh['to_reject']
data_response = requests.get(f"{pre}r6YxrrABRJjideS/download")
arch_real = read_arch_data(io.BytesIO(data_response.content), to_reject)

# arch_real = pd.read_csv("../dat/synth_data/arch.csv")

sed_real = pd.read_csv("../dat/P3.csv")

knots, dt = np.linspace(
    min(arch_real["t"]), max(arch_real["t"]), 2000, retstep=True
)
knots = np.concatenate((
    [min(knots) - 2], [min(knots) - 1], knots, [max(knots) + 1], [max(knots) + 2]
))
if new_sample_from_prior:
    model = Filter(**field_params)
    prior_sample = model.sample_prior(ts=knots, n_samps=10)[0, :, :]
    np.save("../dat/prior_sample.npy", prior_sample)
else:
    prior_sample = np.load("../dat/prior_sample.npy")
# Pad the coefficient array with copies of its first and last rows so it
# matches the extended knot vector, then build a degree-1 B-spline
vals = np.insert(
    prior_sample.T,
    [prior_sample.T.shape[0], 0],
    [prior_sample.T[-1], prior_sample.T[0]],
    axis=0,
)
spline = interpolate.BSpline(knots, vals, 1)
Rejected 347 outliers.
/opt/conda/envs/virtualenv/lib/python3.12/site-packages/sedprep/data_handling.py:553: UserWarning: Records with indices [72 73 74 ... 11527 11758] contain declination, but not inclination! The errors need special treatment!
To be able to use the provided data, these records have been dropped from the output.
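Evaluating the spline at arbitrary epochs returns the coefficients of the sampled reference field. A minimal sketch of that usage, assuming one row of coefficients per requested epoch (which follows from how vals was stacked above; the evaluation epochs are illustrative):

# Hedged sketch: evaluate the reference-field spline at a few epochs;
# each row of `coeffs` is assumed to hold the field coefficients at one epoch.
ts = np.linspace(min(arch_real["t"]), max(arch_real["t"]), 5)
coeffs = spline(ts)
print(coeffs.shape)  # expected: (5, number of coefficients)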
[3]:
np.random.seed(1000)
gen_new_arch_data = False        # set to True to generate new synthetic arch data

if gen_new_arch_data:
    arch, _ = gen_data(
        t=arch_real["t"],       # use temporal distribution of real arch data
        lat=arch_real["lat"],   # use spatial distribution of real arch data
        lon=arch_real["lon"],   # use spatial distribution of real arch data
        field_spline=spline,    # use previously generated prior sample
        dt=arch_real["dt"],     # use temporal errors of real arch data
        ddec=arch_real["dD"],   # use measurement errors of real arch data
        dinc=arch_real["dI"],   # use measurement errors of real arch data
        dint=arch_real["dF"],   # use measurement errors of real arch data
        depth=None,             # arch data has no depth
        with_noise=True,        # add some noise to synthetic arch data
    )
    arch["type"] = "arch_data"
    # To use the new data, store it as a csv file ...
    arch.to_csv(f"{folder}/arch.csv", index=False)
    # ... and add unique identifiers to the file
    add_identifier(f"{folder}/arch.csv")
else:
    arch = pd.read_csv(f"{folder}/arch.csv")
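Before using the catalogue, it is cheap to sanity-check that the loaded file matches the real data it was modeled on. A hedged sketch using only standard pandas calls (the columns t, lat, and lon follow from the inputs passed to gen_data and from the plotting cell below):

# Quick sanity check of the synthetic archaeo catalogue
print(len(arch), "synthetic archaeo records")
print("time span:", arch["t"].min(), "to", arch["t"].max())
print("latitude range:", arch["lat"].min(), "to", arch["lat"].max())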
[4]:
np.random.seed(1000)
gen_new_ref_sed_data = False     # set to True to generate new reference sediment data

ref_sed_dfs = {}
ref_sed_NEZs = {}
lats = {"sweden": sed_real["lat"][0], "rapa": -27.616}
lons = {"sweden": sed_real["lon"][0], "rapa": -144.283}
for loc in ["sweden", "rapa"]:
    if gen_new_ref_sed_data:
        ref_sed, ref_sed_NEZ = gen_data(
            t=sed_real["t"],        # use temporal distribution of real sed data
            lat=lats[loc],          # use latitude distribution of real sed data (-27.616 for rapa)
            lon=lons[loc],          # use longitude distribution of real sed data (-144.283 for rapa)
            field_spline=spline,    # use previously generated prior sample
            dt=sed_real["dt"],      # use temporal errors of real sed data
            depth=sed_real["depth"],# use depth of real sed data
            with_noise=False,       # no noise for reference process
        )
        # Store the data as a csv file using
        ref_sed.to_csv(f"{folder}/ref_sed_{loc}.csv", index=False)
        np.save(f"{folder}/ref_sed_{loc}_NEZ.npy", ref_sed_NEZ)
        ref_sed_NEZs[loc] = np.load(f"{folder}/ref_sed_{loc}_NEZ.npy")
        ref_sed_dfs[loc] = pd.read_csv(f"{folder}/ref_sed_{loc}.csv")
    else:
        ref_sed_NEZs[loc] = np.load(f"{folder}/ref_sed_{loc}_NEZ.npy")
        ref_sed_dfs[loc] = pd.read_csv(f"{folder}/ref_sed_{loc}.csv")
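A quick consistency check between the csv records and the stored arrays can catch stale files. A hedged sketch, where the NEZ array is assumed to hold the clean field components with one row per record, matching how distort_data consumes it below:

# Hedged check: the NEZ arrays should line up with the reference csv records
for loc in ["sweden", "rapa"]:
    print(loc, ref_sed_NEZs[loc].shape, "NEZ vs.", len(ref_sed_dfs[loc]), "records")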
[5]:
np.random.seed(1000)
gen_new_sed_data = False  # set to True to generate new synthetic sed data

# Since sed_real provides no intensity errors, we draw synthetic ones
dint = np.random.normal(4, 2, len(sed_real)).clip(0.1, None)
# Split the record into two subsections
subsections = {
    "A1": np.arange(0, 60, 1),
    "A2": np.arange(60, len(ref_sed_dfs["sweden"]), 1),
}
# Lock-in function parameters for the three test shapes
bs_DIs = {"triangle": [10, 10, 0, 10], "square": [5, 1, 20, 1], "ramp": [1, 1, 0, 20]}
bs_Fs = bs_DIs
# Declination offset parameters for the two subsections
offsets = {"A1": 30., "A2": -10.}
f_shallow = 0.6
cal_fac = 90

sed_dfs = {}
for loc in ["sweden", "rapa"]:
    for file in ["triangle", "square", "ramp"]:
        name = f"{loc}_{file}"
        if gen_new_sed_data:
            sed = distort_data(
                ref_sed_dfs[loc],
                ref_sed_NEZs[loc],
                ddec=sed_real["dD"],    # use measurement errors of real sed data
                dinc=sed_real["dI"],    # use measurement errors of real sed data
                dint=dint,              # synthetic intensity measurement errors
                bs_DI=bs_DIs[file],     # lock-in function parameters for directions
                bs_F=bs_Fs[file],       # lock-in function parameters for intensity
                offsets=offsets,        # declination offset per subsection
                f_shallow=f_shallow,    # inclination shallowing factor
                cal_fac=cal_fac,        # calibration factor for intensity
                subsections=subsections,
            )
            sed["type"] = "sediments"
            sed.to_csv(f"{folder}/sed_{name}.csv", index=False)
            # Add unique identifiers
            add_identifier(f"{folder}/sed_{name}.csv")
        # Load the (possibly freshly generated) synthetic data
        sed_dfs[name] = pd.read_csv(f"{folder}/sed_{name}.csv")
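The four-element vectors in bs_DIs parameterize the lock-in functions via lif from sedprep.utils; the plotting cell below evaluates them as lif(bs)(depth_lags). A small sketch of that usage, assuming lif(bs) returns a callable over depth lag:

# Sketch: evaluate the three lock-in-function shapes defined above
depth_lags = np.linspace(0, 50, 500)
for shape_name, bs in bs_DIs.items():
    values = lif(bs)(depth_lags)
    print(shape_name, "peaks at depth lag", depth_lags[np.argmax(values)])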

[6]:
proj = ccrs.Mollweide()
fig = plt.figure(figsize=(10, 4), dpi=400)
arch_temp_dist = fig.add_subplot(2, 2, 1)
sed_temp_dist = fig.add_subplot(2, 2, 3)
sph_ax = fig.add_subplot(1, 2, 2, projection=proj)

arch_temp_dist.hist(arch.t, bins=50, color="C0")
arch_temp_dist.set_title("Temporal distribution")
sed_temp_dist.hist(ref_sed_dfs["rapa"].t, bins=50, color="C3")
sed_temp_dist.set_xlabel("Time [years]")

sph_ax.set_title("Spatial distribution")
sph_ax.set_global()
sph_ax.add_feature(cfeature.LAND, zorder=0, color="lightgray")
sph_ax.scatter(
    arch.lon,
    arch.lat,
    c="C0",
    label="Synthetic archaeo data (#" + str(int(len(arch))) + ")",
    s=5,
    alpha=1,
    zorder=1,
    rasterized=True,
    transform=ccrs.PlateCarree(),
)
for i, sed_data in enumerate(
    [ref_sed_dfs["sweden"], ref_sed_dfs["rapa"]]
):
    sph_ax.scatter(
        sed_data.lon[0],
        sed_data.lat[0],
        c="C3",
        label="Synthetic sediment data (#" + str(int(len(sed_data))) + " each)" if i == 0 else None,
        s=60,
        marker="*",
        alpha=1,
        zorder=1,
        rasterized=True,
        transform=ccrs.PlateCarree(),
    )
sph_ax.legend(loc="center", prop={"size": 9}, bbox_to_anchor=(0.5, -0.13), ncol=1)
fig.autofmt_xdate()
fig.tight_layout()
/opt/conda/envs/virtualenv/lib/python3.12/site-packages/cartopy/io/__init__.py:241: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/110m_physical/ne_110m_land.zip
[Figure: temporal distributions of the synthetic archaeo (top) and sediment (bottom) data, and their spatial distribution on a Mollweide projection]
[7]:
preprocess_data = False     # set to True to apply estimated parameters to clean data
prep_dfs = {}
for loc in ["sweden", "rapa"]:
    for file in ["triangle", "square", "ramp"]:
        name = f"{loc}_{file}"
        sed = pd.read_csv(f"{folder}/sed_{name}.csv")
        res_DI = pd.read_csv(f"../results/sed_{name}_DI.csv")
        res_F = pd.read_csv(f"../results/sed_{name}_F.csv")
        scipy_DI = res_DI[res_DI.optimizer == "scipy"]
        scipy_F = res_F[res_F.optimizer == "scipy"]
        # row 0 holds the four-parameter (4p) estimates, row 1 the two-parameter (2p) ones
        est_bs_DI_4p = ast.literal_eval(scipy_DI.bs_DI.values[0])
        est_bs_DI_2p = ast.literal_eval(scipy_DI.bs_DI.values[1])
        est_bs_F_4p = ast.literal_eval(scipy_F.bs_F.values[0])
        est_bs_F_2p = ast.literal_eval(scipy_F.bs_F.values[1])
        # offsets are stored as Python-dict strings, so fix the quotes before JSON parsing
        est_offsets_4p = json.loads(scipy_DI.offsets.values[0].replace("'", '"'))
        est_offsets_2p = json.loads(scipy_DI.offsets.values[1].replace("'", '"'))
        est_f_shallow_4p = scipy_DI.f_shallow.values[0]
        est_f_shallow_2p = scipy_DI.f_shallow.values[1]
        est_cal_fac_4p = scipy_F.cal_fac.values[0]
        est_cal_fac_2p = scipy_F.cal_fac.values[1]
        if preprocess_data:
            prep_dfs[name + "_4p"] = clean_data(
                sed,
                est_bs_DI_4p,
                est_bs_F_4p,
                est_offsets_4p,
                est_f_shallow_4p,
                est_cal_fac_4p,
            )
            prep_dfs[name + "_4p"].to_csv(
                f"../results/preprocessed/sed_{name}_preprocessed_4p.csv", index=False
            )
            prep_dfs[name + "_2p"] = clean_data(
                sed,
                est_bs_DI_2p,
                est_bs_F_2p,
                est_offsets_2p,
                est_f_shallow_2p,
                est_cal_fac_2p,
            )
            prep_dfs[name + "_2p"].to_csv(
                f"../results/preprocessed/sed_{name}_preprocessed_2p.csv", index=False
            )
        else:
            prep_dfs[name + "_4p"] = pd.read_csv(
                f"../results/preprocessed/sed_{name}_preprocessed_4p.csv"
            )
            prep_dfs[name + "_2p"] = pd.read_csv(
                f"../results/preprocessed/sed_{name}_preprocessed_2p.csv"
            )
[8]:
lif_knots = np.linspace(0, 50, 1000)
props = dict(boxstyle='round,pad=0.6', facecolor='none', alpha=0.8)
for loc in ["sweden", "rapa"]:
    ref_sed = pd.read_csv(f"{folder}/ref_sed_{loc}.csv")
    for file in ["triangle", "square", "ramp"]:
        name = f"{loc}_{file}"
        sed = pd.read_csv(f"{folder}/sed_{name}.csv")
        res_DI = pd.read_csv(f"../results/sed_{name}_DI.csv")
        res_F = pd.read_csv(f"../results/sed_{name}_F.csv")
        scipy_DI = res_DI[res_DI.optimizer == "scipy"]
        scipy_F = res_F[res_F.optimizer == "scipy"]
        # row 0 holds the four-parameter (4p) estimates, row 1 the two-parameter (2p) ones
        est_bs_DI_4p = ast.literal_eval(scipy_DI.bs_DI.values[0])
        est_bs_DI_2p = ast.literal_eval(scipy_DI.bs_DI.values[1])
        est_bs_F_4p = ast.literal_eval(scipy_F.bs_F.values[0])
        est_bs_F_2p = ast.literal_eval(scipy_F.bs_F.values[1])
        # offsets are stored as Python-dict strings, so fix the quotes before JSON parsing
        est_offsets_4p = json.loads(scipy_DI.offsets.values[0].replace("'", '"'))
        est_offsets_2p = json.loads(scipy_DI.offsets.values[1].replace("'", '"'))
        est_f_shallow_4p = scipy_DI.f_shallow.values[0]
        est_f_shallow_2p = scipy_DI.f_shallow.values[1]
        est_cal_fac_4p = scipy_F.cal_fac.values[0]
        est_cal_fac_2p = scipy_F.cal_fac.values[1]
        fig, axs = plt.subplot_mosaic(
            2 * [3 * ["D"] + ["."]]
            + 2 * [3 * ["D"] + ["LIF_DI"]]
            + 2 * [3 * ["I"] + ["LIF_DI"]]
            + 2 * [3 * ["I"] + ["."]]
            + 4 * [3 * ["F"] + ["LIF_F"]], figsize=(12, 6), dpi=300)
        fig.align_ylabels()
        plt.subplots_adjust(wspace=0.5, hspace=0.2)
        axF_twin = axs["F"].twinx()
        for ax, comp in zip([axs["D"], axs["I"], axF_twin], ["D", "I", "F"]):
            ax.plot(ref_sed.depth, ref_sed[comp], c="C2", label="ref. process")
            if comp != "F":
                ax.set_xticklabels([])
        plot_DIF(sed, axs["D"], axs["I"], axs["F"], time_or_depth="depth", xerr=True, yerr=True, legend=False)
        axs["D"].legend(bbox_to_anchor=(0, 0.98), loc="upper left", ncol=3)
        plot_DIF(prep_dfs[name + "_4p"], axs["D"], axs["I"], axF_twin, time_or_depth="depth", xerr=True, yerr=True, legend=False, distinguish_subs=False)
        plot_DIF(prep_dfs[name + "_2p"], axs["D"], axs["I"], axF_twin, time_or_depth="depth", xerr=True, yerr=True, legend=False, distinguish_subs=False, color="C4")
        axF_twin.set_ylabel("Intensity [µT]")
        for ax, bs, bs_4p, bs_2p in zip([axs["LIF_DI"], axs["LIF_F"]], [bs_DIs, bs_Fs], [est_bs_DI_4p, est_bs_F_4p], [est_bs_DI_2p, est_bs_F_2p]):
            ax.plot(lif_knots, lif(bs[file])(lif_knots), c="C2", lw=2, zorder=2, label="real")
            ax.plot(lif_knots, lif(bs_4p)(lif_knots), c="C0", lw=2, zorder=3, label="4p")
            ax.plot(lif_knots, lif(bs_2p)(lif_knots), c="C4", lw=2, zorder=3, label="2p")
            ax.set_yticks([])
        axs["LIF_DI"].legend(bbox_to_anchor=(0.5, 1.35), title="Lock-in functions", loc="center", ncol=3)
        offsets_text = r"$\bf{Declination\ offsets}$" + "\n"
        offsets_text += f"real: A1: {offsets['A1']:.2f}; A2: {offsets['A2']:.2f}\n"
        offsets_text += f"4p: A1: {est_offsets_4p['A1']:.2f}; A2: {est_offsets_4p['A2']:.2f}\n"
        offsets_text += f"2p: A1: {est_offsets_2p['A1']:.2f}; A2: {est_offsets_2p['A2']:.2f}"
        f_shallow_text = r"$\bf{Shallowing\ factor}$" + "\n"
        f_shallow_text += f"real: {f_shallow:.3f}\n"
        f_shallow_text += f"  4p: {est_f_shallow_4p:.3f}\n"
        f_shallow_text += f"  2p: {est_f_shallow_2p:.3f}"
        cal_fac_text = r"$\bf{Calibration\ factor}$" + "\n"
        cal_fac_text += f"real: {cal_fac:.2f}\n"
        cal_fac_text += f"  4p: {est_cal_fac_4p:.2f}\n"
        cal_fac_text += f"  2p: {est_cal_fac_2p:.2f}"
        axs["D"].text(-0.25, 0.5, offsets_text, transform=axs["D"].transAxes, va="center", ha="center", bbox=props)
        axs["I"].text(-0.25, 0.5, f_shallow_text, transform=axs["I"].transAxes, va="center", ha="center", bbox=props)
        axs["F"].text(-0.25, 0.5, cal_fac_text, transform=axs["F"].transAxes, va="center", ha="center", bbox=props)
[Figures: for each of the six synthetic records (sweden/rapa × triangle/square/ramp): reference process, distorted data, and preprocessed data, with the estimated lock-in functions, declination offsets, shallowing factors, and calibration factors]