from marss2l.utils import setup_stream_logger
from huggingface_hub import hf_file_system
from marss2l.huggingface import REPO_ID
# Hugging Face filesystem handle; the cells below read the CSV files of the
# dataset repository through fs.open().
fs = hf_file_system.HfFileSystem()
Install marss2l package
# Presumably provides the marss2l.* modules imported at the top of this file.
pip install marss2l
CloudSEN12+ Experiment
We downloaded all cloud-free images from the CloudSEN12+ dataset available here. For each of the 10,440 images, we downloaded the corresponding 200x200 Sentinel-2 L1C image and the most similar cloud-free image from a time window of 8 months. We found a cloud-free reference for all images except 5. We computed the cloud masks of these images using the CloudSEN12 model and fetched the wind data from ERA5-Land, or from NASA/GEOS/FP if the location is close to the coastline.
import pandas as pd
import os
# Table of clear CloudSEN12+ images, read from the Hugging Face dataset repo.
csv_path_cloudsen12 = f"datasets/{REPO_ID}/cloudsen12_clear_images.csv"
with fs.open(csv_path_cloudsen12, "r") as csv_file:
    dd = pd.read_csv(csv_file)

# Report how many rows were loaded and the span of their tile dates.
print(f"Number of images: {dd.shape[0]}")
print(f"Date of acquisition of the images: {dd.tile_date.min()} {dd.tile_date.max()}")

# Output folder for the figures saved further down.
os.makedirs("figures", exist_ok=True)
Geographical distribution
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
# World map (Mollweide projection) with one marker per row of `dd`.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# Water colour is sampled from the 'ocean' colormap; land is a flat light grey.
ocean_color = plt.get_cmap('ocean')(210)
land_color = "#ededed"

# lon_0 is central longitude of projection.
# resolution = 'c' means use crude resolution coastlines.
m = Basemap(projection='moll', lon_0=0, resolution='c', ax=ax)
m.drawcoastlines()
m.fillcontinents(color=land_color, lake_color=ocean_color)

# draw parallels and meridians.
m.drawparallels(np.arange(-90., 120., 30.))
m.drawmeridians(np.arange(0., 420., 60.))
m.drawmapboundary(fill_color=ocean_color)

# Convert lon/lat to map coordinates and draw the image locations
# (third positional argument of scatter is the marker size).
x, y = m(dd.lon, dd.lat)
m.scatter(x, y, 1, marker='o', color='C3')
ax.set_title("CloudSEN12+")
plt.savefig("figures/cloudsen12plus_locs.pdf")
Fig. X: Distribution of the clear images used to estimate the global FPR.
Results in marss2l
All exported data are available in the same format as the MARS-S2L dataset on Hugging Face. We can run the evaluation script on this data with the following commands:
# Run eval on the CloudSEN12 dataset with the MARS-S2L model.
# Every continuation backslash is preceded by a space: the shell removes
# backslash-newline entirely, so "16\" followed by an unindented line would
# fuse two arguments into "16--suffix_output".
python -m marss2l.eval_final \
    --split 'no split' \
    --output_dir trained_models/MARSS2L_20250326 \
    --device cpu \
    --num_workers 16 \
    --suffix_output cloudsen12 \
    --csv_path /path/to/localdir/MARS-S2L/cloudsen12_clear_images.csv

# Run eval on the CloudSEN12 dataset with the CH4Net model.
python -m marss2l.eval_final \
    --split 'no split' \
    --output_dir trained_models/CH4Net_20250329 \
    --device cpu \
    --num_workers 16 \
    --suffix_output cloudsen12 \
    --csv_path /path/to/localdir/MARS-S2L/cloudsen12_clear_images.csv
``
The next cell shows the results of these evals:
</div>
</div>
</div>
<div class="cell border-box-sizing code_cell rendered" markdown="1">
<div class="input">
```python
import pandas as pd
# Scene-level prediction tables of each model, stored in the Hugging Face repo.
results_ch4net_csv = f"datasets/{REPO_ID}/trained_models/CH4Net_20250329/preds_no splitcloudsen12th100.csv"
results_marss2l_csv = f"datasets/{REPO_ID}/trained_models/MARSS2L_20250326/preds_no splitcloudsen12th100.csv"
results_mbmp_csv = f"datasets/{REPO_ID}/trained_models/MBMP/preds_no splitcloudsen12th100.csv"


def _load_scene_preds(csv_path, threshold):
    """Read a predictions CSV and flag the scenes scored above ``threshold``.

    Args:
        csv_path: Path of the CSV, opened through the module-level ``fs``.
        threshold: Rows whose ``scene_pred`` is strictly greater than this
            value get ``isplumepred = True``.

    Returns:
        The loaded DataFrame with the extra boolean ``isplumepred`` column.
    """
    with fs.open(csv_path, "r") as fh:
        preds = pd.read_csv(fh)
    preds["isplumepred"] = preds.scene_pred > threshold
    return preds


# The rate printed for each model is the percentage of rows flagged as plume.
d = _load_scene_preds(results_marss2l_csv, 0.5)
fpr = d.isplumepred.sum() / d.shape[0] * 100
print(f"MARS-S2L -> FPR: {fpr:.1f}% Total positive: {d.isplumepred.sum()} Total: {d.shape[0]}")

dch4 = _load_scene_preds(results_ch4net_csv, 0.5)
fprch4 = dch4.isplumepred.sum() / dch4.shape[0] * 100
print(f"CH4Net -> FPR: {fprch4:.1f}% Total positive: {dch4.isplumepred.sum()} Total: {dch4.shape[0]}")

# MBMP scores are thresholded at a different (negative) operating point.
threshold_mbmp = -.985
dmbmp = _load_scene_preds(results_mbmp_csv, threshold_mbmp)
# Kept in its own variable so the CH4Net rate in `fprch4` is not overwritten.
fprmbmp = dmbmp.isplumepred.sum() / dmbmp.shape[0] * 100
print(f"MBMP -> FPR: {fprmbmp:.1f}% Total positive: {dmbmp.isplumepred.sum()} Total: {dmbmp.shape[0]}")
import matplotlib.pyplot as plt
# One histogram of scene-level scores per model, each panel titled with the
# model whose predictions it shows.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
panels = [("MBMP", dmbmp), ("CH4Net", dch4), ("MARS-S2L", d)]
for axis, (panel_title, preds) in zip(ax, panels):
    preds["scene_pred"].hist(ax=axis)
    axis.set_title(panel_title)
plt.suptitle("Distribution of model scores in CloudSEN12 dataset")
from marss2l.metrics import get_scenelevel_metrics, get_pixellevel_metrics
# Base operating points; they form the first pair of the sweep below and are
# no longer overwritten by the loop variables.
threshold_marss2l = 0.5
threshold_mbmp = -0.99

# Each pair is (threshold for CH4Net and MARS-S2L, threshold for MBMP).
threshold_pairs = [
    (threshold_marss2l, threshold_mbmp),
    (0.9, -0.9),
    (0.98, -0.85),
]
models = [("MBMP", dmbmp), ("CH4Net", dch4), ("MARS-S2L", d)]

# Ground-truth label column: every row is labelled 0 (no plume). It does not
# depend on the threshold, so it is set once per table instead of on every
# iteration of the sweep.
for _, dg in models:
    dg["isplumenum"] = 0

mets_rows = []
for thr_scene, thr_mbmp in threshold_pairs:
    for model, dg in models:
        # MBMP scores use their own threshold scale.
        threshold = thr_mbmp if model.startswith("MBMP") else thr_scene
        mets_iter = get_scenelevel_metrics(dg.scene_pred, dg.isplumenum, threshold=threshold,
                                           as_percentage=True)
        mets_seg = get_pixellevel_metrics(TP=dg.TP, TN=dg.TN, FP=dg.FP, FN=dg.FN,
                                          as_percentage=True)
        mets_iter.update(mets_seg)
        mets_iter.update({"nsamples": dg.shape[0],
                          "nlocs": dg.location_name.nunique(),
                          "nplumes": 0,
                          "model_name": model,
                          "threshold": threshold})
        mets_rows.append(mets_iter)

# One row per (threshold pair, model); "model_name" is moved to the first column.
mets = pd.DataFrame(mets_rows)
overall_mets = mets[["model_name"] + [c for c in mets.columns if c != "model_name"]].copy()
# NOTE(review): bare expression, only rendered if it ends a notebook cell.
overall_mets[["model_name", "threshold", "fpr"]]
print(overall_mets[["model_name", "threshold", "fpr"]].to_latex(index=False, float_format="%.2f"))
\begin{table}
\centering
\footnotesize
\begin{tabular}{lrrrl}
\toprule
 & MARS-S2L & CH4Net & MBMP & Total Samples \\
\midrule
FPR (0.5) & 9.4\% & 2.0\% & 87.3\% & 10,434 \\
FPR (0.5) & 9.4\% & 2.0\% & 87.3\% & 10,434 \\
\bottomrule
\end{tabular}
\caption{False positive rate}
\label{tab:fprcloudsen12}
\end{table}