Skip to content

Data distribution in the dataset

Install marss2l package

pip install marss2l
import matplotlib
from marss2l.utils import setup_stream_logger
import logging
import os
from marss2l import plot
from marss2l.plot.colors import C0, C1, C2, C3, C4
from huggingface_hub import hf_file_system
from marss2l.huggingface import REPO_ID

fs = hf_file_system.HfFileSystem()

logger = logging.getLogger(__name__)
setup_stream_logger(logger)

os.makedirs("figures", exist_ok=True)
/home/gonzalo/mambaforge/envs/marss2ltacopy312/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

from marss2l import loaders

csv_path = f"datasets/{REPO_ID}/validated_images_all.csv" 
dataframe_data_traintest = loaders.read_csv(csv_path, add_columns_for_analysis=True, fs=fs, split="all")
dataframe_data_traintest[dataframe_data_traintest.isplume & dataframe_data_traintest.window_col_off.isna()]
id_loc_image s2path plumepath cloudmaskpath ch4path wind_u wind_v vza sza percentage_clear ... year_month year_month_day wind_speed isplumeneg date satellite_constellation year_quarter ch4_fluxrate_th interval_ch4_fluxrate interval_ch4_fluxrate_str
24027 ed298622-838d-43bc-a520-974f870c947c data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f... data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f... data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f... data/test_2023/4/dd0db738-d2ed-4878-bad1-e8e5f... 0.848117 3.266057 10.003118 52.143978 100.0000 ... 2024-11-01 2024-11-13 3.374379 False 2024-11-13 Sentinel-2 2024-4Q 0.634903 (0.5, 1.0] (0.5, 1]
46411 1f8549f9-93f5-4a49-acf8-985dd67f7509 data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/97/1c0e8211-3dc6-42bc-9532-3a9... 0.997565 -1.752183 3.389330 34.724300 99.4625 ... 2023-09-01 2023-09-27 2.016254 False 2023-09-27 Sentinel-2 2023-3Q 2.568378 (2.5, 3.0] (2.5, 3]
46415 3af54601-2eb0-4b03-af23-fbc8b2f02ab0 data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/36/1c0e8211-3dc6-42bc-9532-3a9... -0.526581 5.128983 3.020000 26.950000 100.0000 ... 2023-08-01 2023-08-17 5.155943 False 2023-08-17 Landsat 2023-3Q 22.921395 (20.0, 999.0] >20
46417 c246c1f5-ffd9-4a2c-888c-96d762634c1a data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/42/1c0e8211-3dc6-42bc-9532-3a9... 3.667725 -4.098877 4.349882 19.134449 100.0000 ... 2023-07-01 2023-07-19 5.500272 False 2023-07-19 Sentinel-2 2023-3Q 3.690047 (3.0, 4.0] (3, 4]
46419 f03af98f-dff1-4269-9a4c-2a683f804090 data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/80/1c0e8211-3dc6-42bc-9532-3a9... 3.327713 -5.182068 4.353991 18.134721 99.4275 ... 2023-07-01 2023-07-09 6.158531 False 2023-07-09 Sentinel-2 2023-3Q 9.819270 (9.0, 10.0] (9, 10]
46421 b0f0570d-48e8-4789-9451-1bfef59ec4ba data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/18/1c0e8211-3dc6-42bc-9532-3a9... 5.591415 -5.645346 3.490000 20.780000 100.0000 ... 2023-06-01 2023-06-14 7.945681 False 2023-06-14 Landsat 2023-2Q 2.612090 (2.5, 3.0] (2.5, 3]
46429 30866940-9f13-4629-a7d2-a51c27337d3f data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9... data/train_2023/43/1c0e8211-3dc6-42bc-9532-3a9... 5.212448 -4.143068 4.287291 45.974223 100.0000 ... 2023-02-01 2023-02-19 6.658426 False 2023-02-19 Sentinel-2 2023-1Q 13.063924 (10.0, 15.0] (10, 15]
55456 64060392-0e1a-4895-8d4d-2645247345b7 data/test_2023/99/8a992420-625e-48b3-b802-7f6b... data/test_2023/99/8a992420-625e-48b3-b802-7f6b... data/test_2023/99/8a992420-625e-48b3-b802-7f6b... data/test_2023/99/8a992420-625e-48b3-b802-7f6b... 1.362807 -2.426667 1.840000 31.850000 100.0000 ... 2024-09-01 2024-09-11 2.783155 False 2024-09-11 Landsat 2024-3Q 3.278658 (3.0, 4.0] (3, 4]
55469 8649d9eb-6393-4fa7-bc9e-89dd313b3e13 data/test_2023/39/8a992420-625e-48b3-b802-7f6b... data/test_2023/39/8a992420-625e-48b3-b802-7f6b... data/test_2023/39/8a992420-625e-48b3-b802-7f6b... data/test_2023/39/8a992420-625e-48b3-b802-7f6b... 3.239486 -3.292131 1.860000 24.570000 100.0000 ... 2024-08-01 2024-08-02 4.618701 False 2024-08-02 Landsat 2024-3Q 1.505539 (1.5, 2.0] (1.5, 2]

9 rows × 58 columns

Stats splits dataset

stats_time = dataframe_data_traintest.groupby("split_name")["tile_date"].agg(["count", "min", "max"]).rename(columns={"count": "nimages", "min": "min date", "max": "max date"})
stats_plumes = dataframe_data_traintest.groupby("split_name")["isplume"].sum()
stats_locs = dataframe_data_traintest.groupby("split_name")["location_name"].nunique()
stats_time["nplumes"] = stats_plumes
stats_time["nlocs"] = stats_locs

stats_time.loc[["train_2023", "val_2023", "test_2023"], ["nimages", "nplumes","nlocs", "min date", "max date"]].reset_index()
split_name nimages nplumes nlocs min date max date
0 train_2023 38345 3512 618 2018-01-01 09:13:51+00:00 2023-11-30 10:33:09+00:00
1 val_2023 6018 299 89 2021-01-01 07:03:09+00:00 2021-12-31 10:03:23.429000+00:00
2 test_2023 43524 1832 1289 2024-01-01 00:05:22.713000+00:00 2024-12-31 17:27:23.051000+00:00

Figure wind speed

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

dataframe_data_traintest_indexed = dataframe_data_traintest.set_index("id_loc_image")
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]

valplumes_df = valimages_df[valimages_df.isplume].copy()

fig, ax = plt.subplots(2,1,figsize=(6,3.5),sharex=True, tight_layout=True)

bins = [0,] + np.arange(0.5,valplumes_df.wind_speed.max()+.5,1).tolist()
bins_centers = np.arange(0,valplumes_df.wind_speed.max()+.5,1)
sns.histplot(x="wind_speed",data=valimages_df, bins=bins,color=C1,
            cumulative=False,common_norm=False,ax=ax[0])
sns.histplot(x="wind_speed",data=valimages_df[valimages_df.isplume], bins=bins,
             color=C1,
            cumulative=False,common_norm=False,ax=ax[1])
ax[1].set_xticks(bins_centers)
ax[1].set_ylabel("plumes")
ax[0].set_ylabel("images")
ax[1].set_xlabel("Wind speed (m/s)")
fig.savefig("figures/stats_by_wind.pdf")
No description has been provided for this image

Figure area of the plume

from shapely import from_wkt
import geopandas as gpd
import rasterio.warp
from georeader import get_utm_epsg
from shapely.geometry import shape

dataframe_data_traintest_indexed = gpd.GeoDataFrame(dataframe_data_traintest_indexed, 
                                                    geometry=dataframe_data_traintest_indexed.plume,
                                                    crs="EPSG:4326")

def compute_area(geometry, crs_geometry:str="EPSG:4326") -> float:
    """
    Compute the area of a geometry in square meters by projecting it to UTM.

    Args:
        geometry (Geometry): geometry to compute the area
        crs_geometry (_type_, optional): CRS of the geometry. Defaults to "EPSG:4326".

    Returns:
        float: area of the geometry in square meters
    """
    center = geometry.centroid
    utm_crs = get_utm_epsg(center)
    geometry_utm = shape(rasterio.warp.transform_geom(crs_geometry, 
                                                      utm_crs, geometry))
    return geometry_utm.area


dataframe_data_traintest_indexed["area_meters"] = 0
plumes_good_index = dataframe_data_traintest_indexed.isplume & ~dataframe_data_traintest_indexed["geometry"].is_empty
dataframe_data_traintest_indexed.loc[plumes_good_index, "area_meters"] = dataframe_data_traintest_indexed.loc[plumes_good_index, "geometry"].apply(lambda x: compute_area(x))
dataframe_data_traintest_indexed["area_km"] = dataframe_data_traintest_indexed["area_meters"] / (1000**2)
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]
valplumes_df = valimages_df[valimages_df.isplume].copy()
/tmp/ipykernel_193038/1115262240.py:31: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[ 88661.64262182 119200.62052721 204363.39520962 ... 116100.00000028
  21645.35776812  28835.85959053]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dataframe_data_traintest_indexed.loc[plumes_good_index, "area_meters"] = dataframe_data_traintest_indexed.loc[plumes_good_index, "geometry"].apply(lambda x: compute_area(x))

sns.histplot(x="area_meters",data=valplumes_df,color=C1,
            cumulative=False,common_norm=False)
<Axes: xlabel='area_meters', ylabel='Count'>
No description has been provided for this image
import numpy as np
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

threshold_pixels = 100 * 10**2
fig, ax = plt.subplots(1,1,figsize=(6,3))
ax.hist(valplumes_df["area_meters"], 
        bins=50,
         # bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
         zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
# ax.set_xticks(np.arange(0,100_000,4000), [f"{l:.0f}"for l in np.arange(0,100_000,4000)/1000])
ax.set_xlabel("Area (m²)")
ax.set_ylabel("plumes")
ax.grid(axis="y")

# axins = zoomed_inset_axes(ax, 3, loc=1) # zoom = 6
axins = plt.axes([.3, .6, .65, .3])
axins.hist(valplumes_df["area_meters"], 
         # bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
         bins=list(range(0,1000*10**2,10**3)),
         zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
axins.vlines(x=threshold_pixels, ymin=0, ymax=40, colors=C2)
# axins.set_xticks(np.arange(0,8_500,500), [f"{l:.1f}" if f"{l:.1f}"[-1] == "5" else f"{l:.0f}" for l in np.arange(0,8_500,500)/1000])
# axins.set_yticks(range(0,400, 100))
# axins.set_xlabel("Flux rate (t/h)")
# axins.set_ylabel("# plumes")
axins.grid(axis="y")

mark_inset(ax, axins, loc1=3, loc2=4, fc="none", ec="0.5")

fig.tight_layout()
fig.savefig("figures/areaplume.pdf")
/tmp/ipykernel_193038/1612739842.py:31: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  fig.tight_layout()

No description has been provided for this image
nplumes_smaller = (valplumes_df.area_meters &lt; threshold_pixels).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold_pixels} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")
There are 30 of 5643 plumes with less than 10000 m². 0.53%

# With threshold 50 instead of 100
threshold = 50 *10**2
nplumes_smaller = (valplumes_df.area_meters &lt; threshold).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")
There are 1 of 5643 plumes with less than 5000 m². 0.02%