Overall dataset stats
Install the marss2l package
pip install marss2l
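The geospatial plotting cells below also use geopandas and basemap; if marss2l does not pull them in as dependencies (an assumption), they can be installed separately:
pip install geopandas basemap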
Stats by train/val/test split
%%time
import logging
import os

import matplotlib
from huggingface_hub import HfFileSystem

from marss2l.huggingface import REPO_ID
from marss2l.utils import setup_stream_logger

fs = HfFileSystem()
logger = logging.getLogger(__name__)
setup_stream_logger(logger)

matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

# IBM colorblind-safe palette
C0 = "#648FFF"
C1 = "#785EF0"
C2 = "#DC267F"
C3 = "#FE6100"
C4 = "#FFB000"

os.makedirs("figures", exist_ok=True)
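If the Hugging Face dataset repo is gated (an assumption; it may be fully public), authenticate before listing files:
from huggingface_hub import login
login()  # alternatively, set the HF_TOKEN environment variable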
from marss2l import loaders
csv_path = f"datasets/{REPO_ID}/validated_images_all.csv"
dataframe_data_traintest = loaders.read_csv(
    csv_path, add_columns_for_analysis=True, fs=fs, split="all",
    add_case_study=True, add_loc_type=True,
)
dataframe_data_traintest = dataframe_data_traintest[~dataframe_data_traintest.location_name.isin(loaders.LOCATIONS_CONTROL_RELEASES)].copy()
dataframe_data_traintest["country"] = dataframe_data_traintest.apply(lambda row: "Offshore" if row.offshore else row.country,
axis=1)
loctypedict = {
"FiLM": "Sites seen at training time",
"few samples": "Sites seen at training time",
"no samples": "Test only sites"
}
# Set the type of location ("Seen at training time" or "test only")
dataframe_data_traintest["loc_type_2"] = dataframe_data_traintest["loc_type"].apply(lambda x: loctypedict[x])
rename_splits = {"test_2023": "test", "train_2023": "train", "val_2023": "val"}
dataframe_data_traintest["split_name"] = dataframe_data_traintest["split_name"].apply(lambda x: rename_splits[x] if x in rename_splits else x)
dataframe_data_traintest_test_202324 = dataframe_data_traintest[dataframe_data_traintest.split_name == "test"].copy()
summaries = dataframe_data_traintest.groupby("split_name")["isplume"].agg(["sum", "count"]).rename(columns={"sum": "nplumes", "count": "nimages"})
summaries["nlocs"] = dataframe_data_traintest.groupby("split_name")["location_name"].nunique()
summaries_dates = dataframe_data_traintest.groupby("split_name")["tile_date"].agg(["max", "min"])
summaries["min_date"] = summaries_dates["min"].dt.strftime("%Y-%m-%d %H:%M")
summaries["max_date"] = summaries_dates["max"].dt.strftime("%Y-%m-%d %H:%M")
summaries
summaries.loc[["train","test","val"],["nplumes", "nimages"]].sum()
latex_table_summaries = summaries.loc[["train","val","test"]].reset_index().to_latex(index=False)
print(latex_table_summaries)
summaries.loc[["train","val","test"],["nplumes","nimages","nlocs"]].rename(rename_splits).reset_index().to_dict(orient="records")
Stats by year
summaries_split_year = dataframe_data_traintest.groupby(["split_name","year"])["isplume"].agg(["sum", "count"]).rename(columns={"sum": "nplumes", "count": "nimages"})
summaries_split_year["nlocs"] = dataframe_data_traintest.groupby(["split_name","year"])["location_name"].nunique()
summaries_split_year
Stats by constellation
summaries_split_satellite = dataframe_data_traintest.groupby(["split_name","satellite_constellation"])["isplume"].agg(["sum", "count"]).rename(columns={"sum": "plumes", "count": "images"})
summaries_split_satellite["sites"] = dataframe_data_traintest.groupby(["split_name","satellite_constellation"])["location_name"].nunique()
summaries_split_satellite_date = dataframe_data_traintest.groupby(["split_name","satellite_constellation"])["tile_date"].agg(["max", "min"])
summaries_split_satellite["min date"] = summaries_split_satellite_date["min"].dt.strftime("%Y-%m-%d %H:%M")
summaries_split_satellite["max date"] = summaries_split_satellite_date["max"].dt.strftime("%Y-%m-%d %H:%M")
summaries_split_satellite = summaries_split_satellite[["images","plumes","sites", "min date", "max date"]]
summaries_split_satellite
summaries_split_satellite.loc[["train","val","test"],["plumes","images"]].sum()
latex_table_summary_split_satellite = summaries_split_satellite.loc[["train","val","test"]].to_latex()
print(latex_table_summary_split_satellite)
Stats by country
import pandas as pd
def compute_statistics(df: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """
    Compute statistics for a DataFrame grouped by the specified columns.

    Parameters:
    - df: The input dataframe containing the data.
    - group_cols: A list of columns to group by (e.g., ['country']).

    Returns:
    - DataFrame with aggregated statistics including:
        * images: count of images
        * plumes: number of plumes detected
        * sites: unique locations (sites)
        * sites_with_plumes: unique sites with ≥1 plume
        * sites_seen_in_training: unique sites seen during training
    """
    # Count images and plumes per group
    df_all = df.groupby(group_cols)["isplume"].agg(["count", "sum"]).rename(
        {"count": "images", "sum": "plumes"}, axis=1
    )
    # Count unique locations (sites) per group (country in the example)
    df_all["sites"] = df.groupby(group_cols)["location_name"].nunique()
    # Count unique locations with at least one plume
    # (groups without any plume get NaN here)
    df_all["sites_with_plumes"] = (
        df[df["isplume"] == 1].groupby(group_cols)["location_name"].nunique()
    )
    # Count unique sites seen at training time
    df_all["sites_seen_in_training"] = (
        df[df["loc_type_2"] == "Sites seen at training time"]
        .groupby(group_cols)["location_name"]
        .nunique()
    )
    # Sort groups by the number of images (descending)
    df_all.sort_values("images", ascending=False, inplace=True)
    return df_all
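Note that groups without any plume (or without training sites) get NaN in the corresponding columns; if integer counts are preferred, a minimal post-processing sketch (stats_by_country is an illustrative name):
stats_by_country = compute_statistics(dataframe_data_traintest, ["country"])
count_cols = ["sites_with_plumes", "sites_seen_in_training"]
stats_by_country[count_cols] = stats_by_country[count_cols].fillna(0).astype(int)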
Stats by country: all data
compute_statistics(dataframe_data_traintest, ["country"])
Stats by country: test data
compute_statistics(dataframe_data_traintest_test_202324, ["country"])
Stats by country: train data
dataframe_data_traintest_train_202324 = dataframe_data_traintest[dataframe_data_traintest.split_name == "train"].copy()
compute_statistics(dataframe_data_traintest_train_202324, ["country"])
Stats by type of location
stats_data_by_loctype = dataframe_data_traintest_test_202324.groupby("loc_type_2")["location_name"].agg(["nunique","count"]).rename(columns={"nunique": "sites", "count": "images"})
nplumes = dataframe_data_traintest_test_202324.groupby("loc_type_2")["isplume"].sum()
stats_data_by_loctype["plumes"] = nplumes
latex_stats_data_by_loctype = stats_data_by_loctype[["images", "plumes", "sites"]].to_latex()
print(latex_stats_data_by_loctype)
locs_training = dataframe_data_traintest_test_202324.loc[dataframe_data_traintest_test_202324["loc_type_2"] == "Sites seen at training time","location_name"].unique().tolist()
len(locs_training)
stats_data_by_loctype
Stats for the case studies
Test
stats_test = compute_statistics(dataframe_data_traintest_test_202324, ["case_study"])
stats_test
print(stats_test[["images","plumes","sites", "sites_seen_in_training"]].to_latex())
Train
compute_statistics(dataframe_data_traintest_train_202324, ["case_study"])
Generate a locations GeoPackage (gpkg) for plotting
import pandas as pd
import geopandas as gpd
dataframe_data_traintest["loc_type_2"] = dataframe_data_traintest["loc_type"].apply(lambda x: loctypedict[x])
locations = dataframe_data_traintest.location_name.unique()
loc_df = []
for loc in locations:
    loc_info_i0 = dataframe_data_traintest[dataframe_data_traintest.location_name == loc].iloc[0].to_dict()
    loc_info_copy = {k: loc_info_i0[k] for k in ["country", "offshore", "location_name", "lon", "lat", "sector", "case_study", "loc_type_2"]}
    loc_df.append(loc_info_copy)
loc_df = pd.DataFrame(loc_df)
dataframe_data_traintest_stats = dataframe_data_traintest.groupby("location_name")["isplume"].agg(["count", "sum"]).rename(columns={"count":"nimages", "sum": "nplumes"}).reset_index()
print(loc_df.shape, dataframe_data_traintest_stats.shape)
loc_df = pd.merge(loc_df,dataframe_data_traintest_stats,on="location_name")
print(loc_df.shape)
loc_gdf = gpd.GeoDataFrame(loc_df, geometry=gpd.points_from_xy(loc_df.lon, loc_df.lat), crs="EPSG:4326")
loc_gdf
from matplotlib import colors
from shapely.geometry import box
from marss2l.mars_sentinel2 import countries
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
# Interactive map preview (requires folium); print the error and continue if unavailable
try:
    loc_gdf.explore(column="loc_type_2", cmap=colors.ListedColormap(["C0", "C1", "C2"]))
except Exception as e:
    print(e)
loc_gdf.to_file("all_locs.gpkg", driver="GPKG",index=False)
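A quick round-trip check that the GeoPackage was written correctly (a minimal sketch):
gpd.read_file("all_locs.gpkg").head()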
Derive boundary shapes for the case studies
# Create the grouped dataframe
case_studies_df = loc_gdf.groupby("case_study")[["lat", "lon"]].agg(["min","max"])
# Flatten the multi-level column names
case_studies_df.columns = [f"{col[0]}_{col[1]}" for col in case_studies_df.columns]
case_studies_df["geometry"] = case_studies_df.apply(lambda x: box(x.lon_min,x.lat_min, x.lon_max, x.lat_max), axis=1)
case_studies_df = gpd.GeoDataFrame(case_studies_df, crs="EPSG:4326")
case_studies_not_index = case_studies_df.reset_index()
case_studies_df
case_studies_not_index[~case_studies_not_index.case_study.isin(["Rest","Offshore"])].plot("case_study", legend=True)
case_studies_not_index.to_file("case_study_boundaries.gpkg", driver="GPKG",index=False)
countries_df = countries.all_countries()
countries_df["case_study"] = countries_df.romnam.apply(loaders._set_case_study)
case_studies_df_boundaries = countries_df[["case_study","geometry"]].dissolve(by="case_study").reset_index()
case_studies_df_boundaries
fig, ax = plt.subplots(1, 1, figsize=(8,10))
case_studies_df_boundaries.plot("case_study", legend=True, ax=ax)
new_geometry = case_studies_df_boundaries.set_index("case_study")["geometry"].intersection(case_studies_df.geometry, align=True)
new_geometry = new_geometry[new_geometry.index != "Offshore"]
case_studies_df_boundaries_intersect = gpd.GeoDataFrame(geometry=new_geometry, crs="EPSG:4326").reset_index()
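If a case-study box does not overlap its country polygons, the aligned intersection above can yield empty geometries; a defensive filter (a sketch):
case_studies_df_boundaries_intersect = case_studies_df_boundaries_intersect[
    ~case_studies_df_boundaries_intersect.geometry.is_empty
]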
fig, ax = plt.subplots(1, 1, figsize=(8,10))
case_studies_df_boundaries_intersect.plot("case_study", legend=True, ax=ax)
Basemap plots
from mpl_toolkits.basemap import Basemap
from matplotlib.lines import Line2D
fig, ax = plt.subplots(1,1, figsize=(10,6))
ocean_color = plt.get_cmap('ocean')(210)
land_color = plt.get_cmap('gist_earth')(200)
# lon_0 is central longitude of projection.
# resolution = 'c' means use crude resolution coastlines.
m = Basemap(projection='moll',lon_0=0,resolution='c',ax=ax)
m.drawcoastlines()
m.fillcontinents(color=land_color,lake_color=ocean_color)
# draw parallels and meridians.
m.drawparallels(np.arange(-90.,120.,30.))
m.drawmeridians(np.arange(0.,420.,60.))
m.drawmapboundary(fill_color=ocean_color)
x, y = m(loc_gdf.lon,loc_gdf.lat)
colors_scatter = loc_gdf.loc_type_2.apply(lambda x: "blue" if x == "Test only sites" else "C3")
m.scatter(x,y,1,marker='o',color=colors_scatter)
patches = []
for c, interp in zip(["blue", "C3"], ["Test only sites", "Sites seen at training time"]):
    patches.append(mpatches.Patch(color=c, label=interp))
ax.legend(handles=patches, loc='lower right')
ax.set_title("MARS-S2L dataset")
def plot_dataset_locs(projection="merc", min_lon=None, max_lon=None, min_lat=None,
                      max_lat=None, title="MARS-S2L", colors=None,
                      legend_case_studies=True,
                      figsize=(11, 6)):
    # Default bounds: cover all named case studies, with a few degrees of padding
    named = ~case_studies_not_index.case_study.isin(["Rest", "Offshore"])
    if min_lon is None:
        min_lon = case_studies_not_index.loc[named, 'lon_min'].min() - 2
    if max_lon is None:
        max_lon = case_studies_not_index.loc[named, 'lon_max'].max() + 2
    if min_lat is None:
        min_lat = case_studies_not_index.loc[named, 'lat_min'].min() - 5
    if max_lat is None:
        max_lat = case_studies_not_index.loc[named, 'lat_max'].max() + 2
    fig, ax = plt.subplots(1, 1, figsize=figsize, tight_layout=True)
    ocean_color = plt.get_cmap('ocean')(210)
    land_color = "#ededed"
    # lon_0 is the central longitude of the projection
    m = Basemap(projection=projection, lon_0=0,
                llcrnrlon=min_lon, llcrnrlat=min_lat,
                urcrnrlon=max_lon, urcrnrlat=max_lat,
                resolution='c', ax=ax)
    m.drawcoastlines()
    m.fillcontinents(color=land_color, lake_color=ocean_color)
    # Draw parallels and meridians
    m.drawparallels(np.arange(-90., 120., 30.))
    m.drawmeridians(np.arange(0., 420., 60.))
    m.drawmapboundary(fill_color=ocean_color)
    # One color per case study, or a single repeated color if a string is given
    legend_patches = []
    if colors is None:
        colors = plt.cm.tab20(np.linspace(0, 1, len(case_studies_df)))
    elif isinstance(colors, str):
        colors = [colors for _ in range(len(case_studies_df))]
    area_plot = box(min_lon, min_lat, max_lon, max_lat)
    # Draw the boundary of each case study that intersects the plotted area
    for i, (idx, row) in enumerate(case_studies_df_boundaries_intersect.iterrows()):
        case_name = row.case_study
        if case_name in ["Rest", "Offshore"]:
            continue
        geom = row.geometry
        if not geom.intersects(area_plot):
            continue
        color = colors[i]
        if geom.geom_type == 'Polygon':
            # Extract and transform coordinates, then plot outline and fill
            x, y = geom.exterior.xy
            map_x, map_y = m(x, y)
            m.plot(map_x, map_y, 'k-', linewidth=1)
            ax.fill(map_x, map_y, alpha=0.5, color=color)
        elif geom.geom_type == 'MultiPolygon':
            # Handle each polygon in the multipolygon
            for polygon in geom.geoms:
                x, y = polygon.exterior.xy
                map_x, map_y = m(x, y)
                m.plot(map_x, map_y, 'k-', linewidth=1)
                ax.fill(map_x, map_y, alpha=0.5, color=color)
        # Legend entry for this case study
        legend_patches.append(Line2D([0], [0], marker='o', color=color, linestyle='None',
                                     label=case_name))
    # Scatterplot of sites
    x, y = m(loc_gdf.lon, loc_gdf.lat)
    colors_scatter = loc_gdf.loc_type_2.apply(lambda x: "C3" if x == "Test only sites" else "blue")
    m.scatter(x, y, 7.5, marker='o', color=colors_scatter)
    stats_sites = stats_data_by_loctype.to_dict()["sites"]
    patches = []
    for c, interp in zip(["C3", "blue"], ["Test only sites", "Sites seen at training time"]):
        patches.append(Line2D([0], [0], marker='o', color=c, linestyle='None',
                              label=f"{interp} ({stats_sites[interp]})"))
    legend1 = ax.legend(handles=patches, loc='upper left')
    # Add the case-study legend as a second legend and restore the first one
    if legend_case_studies:
        ax.legend(handles=legend_patches, loc='upper left', bbox_to_anchor=(0, 0.9),
                  title="Case study areas")
        ax.add_artist(legend1)
    ax.set_title(title)
    fig.tight_layout()
    return fig, ax
plot_dataset_locs(projection="moll", colors="#5b5b5b", title="")
plt.savefig("figures/fig1_map_with_case_studies.pdf")
plot_dataset_locs(projection="moll", colors="#5b5b5b", title="", legend_case_studies=False)
plt.savefig("figures/fig1_map_with_case_studies_no_casestudieslegend.pdf")
plot_dataset_locs(projection="moll")
plt.savefig("figures/fig1_map_with_case_studies_colors.pdf")
plot_dataset_locs(colors="#5b5b5b")
plot_dataset_locs(projection="moll")
# BBox Permian Basin
max_lat, min_lon = 34.058354, -105.175890
min_lat, max_lon = 29.593088, -99.939494
plot_dataset_locs(min_lon=min_lon, max_lon=max_lon, min_lat=min_lat, max_lat=max_lat, title="Permian")
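These regional maps can be saved like the global ones (the filename here is illustrative):
plt.savefig("figures/map_permian_basin.pdf")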
# BBox Persian Gulf
max_lat, min_lon = 26.11167903485588 + 7, 51.22215743823728 - 10
min_lat, max_lon = 26.11167903485588 - 10, 51.22215743823728 + 7
plot_dataset_locs(min_lon=min_lon, max_lon=max_lon, min_lat=min_lat, max_lat=max_lat, title="Persian Gulf")