Data distribution in the dataset
Install marss2l package
pip install marss2l
import matplotlib
from marss2l.utils import setup_stream_logger
import logging
import os
from marss2l import plot
from marss2l.plot.colors import C0, C1, C2, C3, C4
from huggingface_hub import hf_file_system
from marss2l.huggingface import REPO_ID
fs = hf_file_system.HfFileSystem()
logger = logging.getLogger(__name__)
setup_stream_logger(logger)
os.makedirs("figures", exist_ok=True)
from marss2l import loaders
csv_path = f"datasets/{REPO_ID}/validated_images_all.csv"
dataframe_data_traintest = loaders.read_csv(csv_path, add_columns_for_analysis=True, fs=fs, split="all")
dataframe_data_traintest[dataframe_data_traintest.isplume & dataframe_data_traintest.window_col_off.isna()]
Stats splits dataset
stats_time = dataframe_data_traintest.groupby("split_name")["tile_date"].agg(["count", "min", "max"]).rename(columns={"count": "nimages", "min": "min date", "max": "max date"})
stats_plumes = dataframe_data_traintest.groupby("split_name")["isplume"].sum()
stats_locs = dataframe_data_traintest.groupby("split_name")["location_name"].nunique()
stats_time["nplumes"] = stats_plumes
stats_time["nlocs"] = stats_locs
stats_time.loc[["train_2023", "val_2023", "test_2023"], ["nimages", "nplumes","nlocs", "min date", "max date"]].reset_index()
Figure wind speed
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
dataframe_data_traintest_indexed = dataframe_data_traintest.set_index("id_loc_image")
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]
valplumes_df = valimages_df[valimages_df.isplume].copy()
fig, ax = plt.subplots(2,1,figsize=(6,3.5),sharex=True, tight_layout=True)
bins = [0,] + np.arange(0.5,valplumes_df.wind_speed.max()+.5,1).tolist()
bins_centers = np.arange(0,valplumes_df.wind_speed.max()+.5,1)
sns.histplot(x="wind_speed",data=valimages_df, bins=bins,color=C1,
cumulative=False,common_norm=False,ax=ax[0])
sns.histplot(x="wind_speed",data=valimages_df[valimages_df.isplume], bins=bins,
color=C1,
cumulative=False,common_norm=False,ax=ax[1])
ax[1].set_xticks(bins_centers)
ax[1].set_ylabel("plumes")
ax[0].set_ylabel("images")
ax[1].set_xlabel("Wind speed (m/s)")
fig.savefig("figures/stats_by_wind.pdf")
Figure area of the plume
from shapely import from_wkt
import geopandas as gpd
import rasterio.warp
from georeader import get_utm_epsg
from shapely.geometry import shape
dataframe_data_traintest_indexed = gpd.GeoDataFrame(dataframe_data_traintest_indexed,
geometry=dataframe_data_traintest_indexed.plume,
crs="EPSG:4326")
def compute_area(geometry, crs_geometry:str="EPSG:4326") -> float:
"""
Compute the area of a geometry in square meters by projecting it to UTM.
Args:
geometry (Geometry): geometry to compute the area
crs_geometry (_type_, optional): CRS of the geometry. Defaults to "EPSG:4326".
Returns:
float: area of the geometry in square meters
"""
center = geometry.centroid
utm_crs = get_utm_epsg(center)
geometry_utm = shape(rasterio.warp.transform_geom(crs_geometry,
utm_crs, geometry))
return geometry_utm.area
dataframe_data_traintest_indexed["area_meters"] = 0
plumes_good_index = dataframe_data_traintest_indexed.isplume & ~dataframe_data_traintest_indexed["geometry"].is_empty
dataframe_data_traintest_indexed.loc[plumes_good_index, "area_meters"] = dataframe_data_traintest_indexed.loc[plumes_good_index, "geometry"].apply(lambda x: compute_area(x))
dataframe_data_traintest_indexed["area_km"] = dataframe_data_traintest_indexed["area_meters"] / (1000**2)
valimages_df = dataframe_data_traintest_indexed.loc[dataframe_data_traintest_indexed.split_name.isin(["train_2023", "val_2023", "test_2023"])]
valplumes_df = valimages_df[valimages_df.isplume].copy()
sns.histplot(x="area_meters",data=valplumes_df,color=C1,
cumulative=False,common_norm=False)
import numpy as np
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
threshold_pixels = 100 * 10**2
fig, ax = plt.subplots(1,1,figsize=(6,3))
ax.hist(valplumes_df["area_meters"],
bins=50,
# bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
# ax.set_xticks(np.arange(0,100_000,4000), [f"{l:.0f}"for l in np.arange(0,100_000,4000)/1000])
ax.set_xlabel("Area (m²)")
ax.set_ylabel("plumes")
ax.grid(axis="y")
# axins = zoomed_inset_axes(ax, 3, loc=1) # zoom = 6
axins = plt.axes([.3, .6, .65, .3])
axins.hist(valplumes_df["area_meters"],
# bins=list(range(0,10_000,500))+list(range(10_000,100_000,5_000)),
bins=list(range(0,1000*10**2,10**3)),
zorder=2, alpha=0.8, linewidth=1, edgecolor="black",color=C1)
axins.vlines(x=threshold_pixels, ymin=0, ymax=40, colors=C2)
# axins.set_xticks(np.arange(0,8_500,500), [f"{l:.1f}" if f"{l:.1f}"[-1] == "5" else f"{l:.0f}" for l in np.arange(0,8_500,500)/1000])
# axins.set_yticks(range(0,400, 100))
# axins.set_xlabel("Flux rate (t/h)")
# axins.set_ylabel("# plumes")
axins.grid(axis="y")
mark_inset(ax, axins, loc1=3, loc2=4, fc="none", ec="0.5")
fig.tight_layout()
fig.savefig("figures/areaplume.pdf")
nplumes_smaller = (valplumes_df.area_meters < threshold_pixels).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold_pixels} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")
# With threshold 50 instead of 100
threshold = 50 *10**2
nplumes_smaller = (valplumes_df.area_meters < threshold).sum()
print(f"There are {nplumes_smaller} of {valplumes_df.shape[0]} plumes with less than {threshold} m². {nplumes_smaller/valplumes_df.shape[0] * 100:.2f}%")