Data distribution in the dataset

Install `marss2l` package

pip install marss2l

import matplotlib
from marss2l.utils import setup_stream_logger
import os
from marss2l import plot
from marss2l.plot.colors import C0, C1, C2, C3, C4
from huggingface_hub import hf_file_system
from marss2l.huggingface import REPO_ID

fs = hf_file_system.HfFileSystem()

logger = setup_stream_logger()

os.makedirs("figures", exist_ok=True)

/home/gonzalo/mambaforge/envs/marss2ltacopy312/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

from marss2l import loaders

csv_path = f"datasets/{REPO_ID}/validated_images_all.csv" 
dataframe_data_traintest = loaders.read_csv(csv_path, add_columns_for_analysis=True, fs=fs, split="all")

dataframe_data_traintest[dataframe_data_traintest.isplume &amp; dataframe_data_traintest.window_col_off.isna()]

	id_loc_image	s2path	plumepath	cloudmaskpath	ch4path	wind_u	wind_v	vza	sza	percentage_clear	...	year_month	year_month_day	wind_speed	isplumeneg	date	satellite_constellation	year_quarter	ch4_fluxrate_th	interval_ch4_fluxrate	interval_ch4_fluxrate_str
24027	ed298622-838d-43bc-a520-974f870c947c	data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f...	data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f...	data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f...	data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f...	0.848117	3.266057	10.003118	52.143978	100.0000	...	2024-11-01	2024-11-13	3.374379	False	2024-11-13	Sentinel-2	2024-4Q	0.634903	(0.5, 1.0]	(0.5, 1]
46411	1f8549f9-93f5-4a49-acf8-985dd67f7509	data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9...	0.997565	-1.752183	3.389330	34.724300	99.4625	...	2023-09-01	2023-09-27	2.016254	False	2023-09-27	Sentinel-2	2023-3Q	2.568378	(2.5, 3.0]	(2.5, 3]
46415	3af54601-2eb0-4b03-af23-fbc8b2f02ab0	data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9...	-0.526581	5.128983	3.020000	26.950000	100.0000	...	2023-08-01	2023-08-17	5.155943	False	2023-08-17	Landsat	2023-3Q	22.921395	(20.0, 999.0]	>20
46417	c246c1f5-ffd9-4a2c-888c-96d762634c1a	data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9...	3.667725	-4.098877	4.349882	19.134449	100.0000	...	2023-07-01	2023-07-19	5.500272	False	2023-07-19	Sentinel-2	2023-3Q	3.690047	(3.0, 4.0]	(3, 4]
46419	f03af98f-dff1-4269-9a4c-2a683f804090	data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9...	3.327713	-5.182068	4.353991	18.134721	99.4275	...	2023-07-01	2023-07-09	6.158531	False	2023-07-09	Sentinel-2	2023-3Q	9.819270	(9.0, 10.0]	(9, 10]
46421	b0f0570d-48e8-4789-9451-1bfef59ec4ba	data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9...	5.591415	-5.645346	3.490000	20.780000	100.0000	...	2023-06-01	2023-06-14	7.945681	False	2023-06-14	Landsat	2023-2Q	2.612090	(2.5, 3.0]	(2.5, 3]
46429	30866940-9f13-4629-a7d2-a51c27337d3f	data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9...	data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9...	5.212448	-4.143068	4.287291	45.974223	100.0000	...	2023-02-01	2023-02-19	6.658426	False	2023-02-19	Sentinel-2	2023-1Q	13.063924	(10.0, 15.0]	(10, 15]
55456	64060392-0e1a-4895-8d4d-2645247345b7	data/test_2023/99/8a992420-625e-48b3-b802-7f6b...	data/test_2023/99/8a992420-625e-48b3-b802-7f6b...	data/test_2023/99/8a992420-625e-48b3-b802-7f6b...	data/test_2023/99/8a992420-625e-48b3-b802-7f6b...	1.362807	-2.426667	1.840000	31.850000	100.0000	...	2024-09-01	2024-09-11	2.783155	False	2024-09-11	Landsat	2024-3Q	3.278658	(3.0, 4.0]	(3, 4]
55469	8649d9eb-6393-4fa7-bc9e-89dd313b3e13	data/test_2023/39/8a992420-625e-48b3-b802-7f6b...	data/test_2023/39/8a992420-625e-48b3-b802-7f6b...	data/test_2023/39/8a992420-625e-48b3-b802-7f6b...	data/test_2023/39/8a992420-625e-48b3-b802-7f6b...	3.239486	-3.292131	1.860000	24.570000	100.0000	...	2024-08-01	2024-08-02	4.618701	False	2024-08-02	Landsat	2024-3Q	1.505539	(1.5, 2.0]	(1.5, 2]

9 rows × 58 columns

Stats splits dataset

stats_time = dataframe_data_traintest.groupby("split_name")["tile_date"].agg(["count", "min", "max"]).rename(columns={"count": "nimages", "min": "min date", "max": "max date"})
stats_plumes = dataframe_data_traintest.groupby("split_name")["isplume"].sum()
stats_locs = dataframe_data_traintest.groupby("split_name")["location_name"].nunique()
stats_time["nplumes"] = stats_plumes
stats_time["nlocs"] = stats_locs

stats_time.loc[["train_2023", "val_2023", "test_2023"], ["nimages", "nplumes","nlocs", "min date", "max date"]].reset_index()

	split_name	nimages	nplumes	nlocs	min date	max date
0	train_2023	38345	3512	618	2018-01-01 09:13:51+00:00	2023-11-30 10:33:09+00:00
1	val_2023	6018	299	89	2021-01-01 07:03:09+00:00	2021-12-31 10:03:23.429000+00:00
2	test_2023	43524	1832	1289	2024-01-01 00:05:22.713000+00:00	2024-12-31 17:27:23.051000+00:00

Figure wind speed

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

dataframe_data_traintest_indexed = dataframe_data_traintest.set_index("id_loc_image")
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]

valplumes_df = valimages_df[valimages_df.isplume].copy()

fig, ax = plt.subplots(2,1,figsize=(6,3.5),sharex=True, tight_layout=True)

bins = [0,] + np.arange(0.5,valplumes_df.wind_speed.max()+.5,1).tolist()
bins_centers = np.arange(0,valplumes_df.wind_speed.max()+.5,1)
sns.histplot(x="wind_speed",data=valimages_df, bins=bins,color=C1,
            cumulative=False,common_norm=False,ax=ax[0])
sns.histplot(x="wind_speed",data=valimages_df[valimages_df.isplume], bins=bins,
             color=C1,
            cumulative=False,common_norm=False,ax=ax[1])
ax[1].set_xticks(bins_centers)
ax[1].set_ylabel("plumes")
ax[0].set_ylabel("images")
ax[1].set_xlabel("Wind speed (m/s)")
fig.savefig("figures/stats_by_wind.pdf")

No description has been provided for this image

Figure area of the plume

from shapely import from_wkt
import geopandas as gpd
import rasterio.warp
from georeader import get_utm_epsg
from shapely.geometry import shape

dataframe_data_traintest_indexed = gpd.GeoDataFrame(dataframe_data_traintest_indexed, 
                                                    geometry=dataframe_data_traintest_indexed.plume,
                                                    crs="EPSG:4326")

def compute_area(geometry, crs_geometry:str="EPSG:4326") -&gt; float:
    """
    Compute the area of a geometry in square meters by projecting it to UTM.

    Args:
        geometry (Geometry): geometry to compute the area
        crs_geometry (_type_, optional): CRS of the geometry. Defaults to "EPSG:4326".

    Returns:
        float: area of the geometry in square meters
    """
    center = geometry.centroid
    utm_crs = get_utm_epsg(center)
    geometry_utm = shape(rasterio.warp.transform_geom(crs_geometry, 
                                                      utm_crs, geometry))
    return geometry_utm.area


# Initialise as float: areas are floats, and pandas &gt;=3.0 raises instead of
# silently upcasting when float values are assigned into an int64 column.
dataframe_data_traintest_indexed["area_meters"] = 0.0
plumes_good_index = dataframe_data_traintest_indexed.isplume &amp; ~dataframe_data_traintest_indexed["geometry"].is_empty
dataframe_data_traintest_indexed.loc[plumes_good_index, "area_meters"] = dataframe_data_traintest_indexed.loc[plumes_good_index, "geometry"].apply(lambda x: compute_area(x))
dataframe_data_traintest_indexed["area_km"] = dataframe_data_traintest_indexed["area_meters"] / (1000**2)
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]
valplumes_df = valimages_df[valimages_df.isplume].copy()

sns.histplot(x="area_meters",data=valplumes_df,color=C1,
            cumulative=False,common_norm=False)

<Axes: xlabel='area_meters', ylabel='Count'>

import numpy as np
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

threshold_pixels = 100 * 10**2
fig, ax = plt.subplots(1,1,figsize=(6,3))
ax.hist(valplumes_df["area_meters"], 
        bins=50,
         # bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
         zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
# ax.set_xticks(np.arange(0,100_000,4000), [f"{l:.0f}"for l in np.arange(0,100_000,4000)/1000])
ax.set_xlabel("Area (m²)")
ax.set_ylabel("plumes")
ax.grid(axis="y")

# axins = zoomed_inset_axes(ax, 3, loc=1) # zoom = 6
axins = plt.axes([.3, .6, .65, .3])
axins.hist(valplumes_df["area_meters"], 
         # bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
         bins=list(range(0,1000*10**2,10**3)),
         zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
axins.vlines(x=threshold_pixels, ymin=0, ymax=40, colors=C2)
# axins.set_xticks(np.arange(0,8_500,500), [f"{l:.1f}" if f"{l:.1f}"[-1] == "5" else f"{l:.0f}" for l in np.arange(0,8_500,500)/1000])
# axins.set_yticks(range(0,400, 100))
# axins.set_xlabel("Flux rate (t/h)")
# axins.set_ylabel("# plumes")
axins.grid(axis="y")

mark_inset(ax, axins, loc1=3, loc2=4, fc="none", ec="0.5")

fig.tight_layout()
fig.savefig("figures/areaplume.pdf")

/tmp/ipykernel_193038/1612739842.py:31: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  fig.tight_layout()

nplumes_smaller = (valplumes_df.area_meters &lt; threshold_pixels).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold_pixels} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")

There are 30 of 5643 plumes with less than 10000 m². 0.53%

# With threshold 50 instead of 100
threshold = 50 *10**2
nplumes_smaller = (valplumes_df.area_meters &lt; threshold).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")

There are 1 of 5643 plumes with less than 5000 m². 0.02%

Data distribution in the dataset

Install marss2l package

Stats splits dataset

Figure wind speed

Figure area of the plume

Install `marss2l` package