Data Overview¶
Cargamos algunas de las librerias necesarias para nuestro análisis.
from __future__ import annotations
from pathlib import Path
import gc
import os
import base64
import hmac
import hashlib
from typing import Dict, Optional
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.transforms as transforms
import movingpandas as mpd
import numpy as np
import hvplot.pandas
from pyproj import Geod
#mpd.show_versions()
# Go to workspace root
def find_repo_root(start: Optional[Path] = None) -> Path:
    """Walk upward from *start* (default: cwd) and return the first directory
    that contains a repo marker; fall back to the resolved start directory."""
    base = (start or Path.cwd()).resolve()
    marker_names = (".git", "pyproject.toml", "requirements.txt", "setup.cfg")
    for candidate in (base, *base.parents):
        if any((candidate / name).exists() for name in marker_names):
            return candidate
    return base
Ingestion¶
Ingestamos los archivos que tenemos disponibles
# Configure ingestion: resolve the workspace root and locate the raw CSV dumps.
ROOT = find_repo_root()
DATA_DIR = "data/raw"
DATA_PATH = ROOT / DATA_DIR
CSV_PATH = sorted(DATA_PATH.glob("*.csv"))[0]  # first CSV alphabetically; any daily dump works
# Column names of the telematics CSV schema used throughout the notebook.
ID_COL = "nombre"    # vehicle identifier (plate/domain) - obfuscated later
TIME_COL = "fecha"   # timestamp string, parsed to datetime later
LAT_COL = "lat"      # latitude (treated as EPSG:4326 downstream)
LON_COL = "lon"      # longitude (treated as EPSG:4326 downstream)
SPEED_COL = "vel"    # reported speed (presumably km/h - TODO confirm with provider)
# Read the raw CSV into memory.
df = pd.read_csv(CSV_PATH)
print(f"Raw data points: {len(df)} in {CSV_PATH}")
print(f"Raw vehicles: {len(df[ID_COL].unique())}")
Raw data points: 14138871 in C:\fabro\Projects\eda-notebook-workspace\data\raw\datos_telematica_2025-09-02.csv
Raw vehicles: 20673
Ofuscación¶
Como nuestro Notebook estará publicado en un repositorio público, vamos a ofuscar las identidades. Generamos un Hash a partir de los dominios de los vehículos y una clave privada. Recortamos el token final a 12 caracteres (suficiente para el parque de vehículos que venimos trabajando ~20k).
# Needed functions
# Load KEY=VALUE
def load_kv_file(path: Path) -> Dict[str, str]:
    """Parse a simple KEY=VALUE file (dotenv-style) into a dict.

    Blank lines, lines starting with '#', and lines without '=' are skipped.
    Values are stripped of surrounding whitespace and of surrounding single
    or double quotes. Returns an empty dict when the file does not exist.
    """
    result: Dict[str, str] = {}
    if not path.exists():
        return result
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        result[key.strip()] = value.strip().strip("'").strip('"')
    return result
# Get Key from file
def get_hmac_key(
    env_var: str = "PLATE_HMAC_KEY_B64",
    secrets_relpath: str = "config/secrets.env",
) -> bytes:
    """Load the base64-encoded HMAC key from the repo's secrets file.

    Looks up *env_var* in ``<repo root>/<secrets_relpath>`` (KEY=VALUE format),
    decodes it from base64 and enforces a minimum length of 16 bytes.

    Raises:
        RuntimeError: if the variable is absent from the secrets file.
        ValueError: if the value is not valid base64 or the key is too short.
    """
    root = find_repo_root()
    secrets_path = root / secrets_relpath
    kv = load_kv_file(secrets_path)
    b64 = kv.get(env_var)
    if not b64:
        raise RuntimeError(
            f"No encontré {env_var}. Seteala en {secrets_relpath}."
        )
    try:
        key = base64.b64decode(b64, validate=True)
    except Exception as e:
        # Chain the original decode error so the root cause isn't lost.
        raise ValueError(f"{env_var} no es base64 válido: {e}") from e
    if len(key) < 16:
        raise ValueError(f"{env_var} demasiado corta ({len(key)} bytes). Usá >= 16, ideal 32.")
    return key
# HMAC -> Base62 -> token corto
_ALPH = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
def _b62_from_int(x: int) -> str:
if x == 0:
return _ALPH[0]
s = []
while x:
x, r = divmod(x, 62)
s.append(_ALPH[r])
return "".join(reversed(s))
def _bytes_needed_for_base62_len(out_len: int) -> int:
    """Return how many digest bytes to take so their base62 rendering can
    cover *out_len* characters (with deliberate headroom)."""
    # bits ~= out_len * log2(62) ; log2(62) ≈ 5.954
    # NOTE(review): the "+ 7" here combined with the "(bits + 7) // 8" below
    # ceilings twice, over-allocating up to one extra byte. Harmless (the
    # token is trimmed to out_len afterwards), but do NOT "fix" it: changing
    # the byte count would change every generated token.
    bits = int((out_len * 5.954242509439324) + 7)
    return max(8, (bits + 7) // 8)  # at least 8 bytes (64 bits) for robustness
def plate_token_norm(norm_plate: str, key: bytes, out_len: int = 12) -> str:
    """HMAC-SHA256 a normalized plate and render it as a fixed-width base62
    token of exactly *out_len* characters."""
    payload = (norm_plate or "").encode("utf-8")
    mac = hmac.new(key, payload, hashlib.sha256)
    take = _bytes_needed_for_base62_len(out_len)
    value = int.from_bytes(mac.digest()[:take], "big")
    encoded = _b62_from_int(value)
    # Left-pad with the zero digit, then trim to the requested width.
    return encoded.rjust(out_len, _ALPH[0])[:out_len]
def obfuscate_plate_column(df: pd.DataFrame, col: str, key: bytes, out_len: int = 12, out_col: str | None = None):
    """Replace plate identifiers in *col* with deterministic HMAC tokens.

    Plates are normalized (trim, uppercase, drop spaces and dashes) before
    hashing, so visually-equivalent plates map to the same token. Writes the
    result to *out_col* (defaults to *col*) in place and returns the mutated
    frame; missing plates stay missing (pd.NA).
    """
    target = out_col or col
    as_string = df[col].astype("string")
    normalized = (
        as_string.fillna("")
        .str.strip()
        .str.upper()
        .str.replace(" ", "", regex=False)
        .str.replace("-", "", regex=False)
    )
    # Hash each distinct plate once, then broadcast back via factorize codes.
    codes, uniques = pd.factorize(normalized, sort=False)
    unique_tokens = np.array(
        [plate_token_norm(u, key=key, out_len=out_len) for u in uniques],
        dtype=object,
    )
    df[target] = unique_tokens[codes]
    # Restore missingness that fillna("") erased above.
    df.loc[as_string.isna(), target] = pd.NA
    return df
# Obfuscate vehicle identities in place, then drop the key from memory.
key = get_hmac_key()
df = obfuscate_plate_column(df, col=ID_COL, key=key, out_len=12)
del key  # don't keep the secret alive longer than needed
gc.collect()
72
Data Cleaning and Filtering¶
df.head()  # quick visual check that the ID column is now tokenized
| nombre | fecha | idtipo | idtipoexterno | segundosdemorarecepcion | lat | lon | vel | rumbo | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | nzb6PvZHUPEZ | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | NaN | NaN | NaN | NaN |
| 1 | XuzAaa26JPjR | 2025-09-02 02:46:42+00 | 0 | 1 | 3 | -32.85436 | -61.34860 | 82.0 | 266.0 |
| 2 | OL7Nf8ksDelc | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | -34.61745 | -60.40863 | 0.0 | NaN |
| 3 | zT7zfozMrWsX | 2025-09-02 02:46:40+00 | 0 | 1006 | 5 | -33.53368 | -60.10854 | 82.0 | 319.0 |
| 4 | TtPO38583MG3 | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | -34.31298 | -60.23689 | 0.0 | NaN |
Basic Clean¶
Realizamos limpieza básica de los datos.
# Drop rows missing any column required for trajectory analysis.
required_cols = [ID_COL, TIME_COL, LAT_COL, LON_COL, SPEED_COL]
df_clean = df.dropna(subset=required_cols).copy()
print(f"Removed {len(df) - len(df_clean)} rows with missing values in {required_cols}")
Removed 82908 rows with missing values in ['nombre', 'fecha', 'lat', 'lon', 'vel']
# Drop vehicles with fewer than two fixes (a trajectory needs >= 2 points).
vehicles = df_clean[ID_COL].unique()
traj_sizes = df_clean.groupby(ID_COL).size()
valid_ids = traj_sizes[traj_sizes >= 2].index
df_clean = df_clean[df_clean[ID_COL].isin(valid_ids)]
print(f"Removed {len(vehicles) - len(valid_ids)} vehicles with <2 registries")
Removed 62 vehicles with <2 registries
Discard Home Ranged Vehicles¶
Descartamos vehículos que consideramos que están en un área operativa reducida o, posiblemente para el momento del análisis, dentro del home range si consideramos que todos realizan viajes de larga distancia.
# Radius (km) under which a vehicle is considered to stay in its home range.
HOME_RANGE_THRESHOLD = 50
# Sort by ID and time so "first"/"last" aggregations are chronological.
# NOTE(review): TIME_COL is still a string here (parsed to datetime later);
# lexicographic order matches chronological order only if all timestamps
# share the same ISO-like format - confirm for multi-day files.
df_clean = df_clean.sort_values([ID_COL, TIME_COL])
# First and last fix per vehicle.
ends = df_clean.groupby(ID_COL).agg(
    lon_start=(LON_COL, "first"),
    lat_start=(LAT_COL, "first"),
    lon_end=(LON_COL, "last"),
    lat_end=(LAT_COL, "last"),
)
# Geodesic start->end distance on the WGS84 ellipsoid. geod.inv returns
# (forward azimuth, back azimuth, distance in meters); we keep the distance.
geod = Geod(ellps="WGS84")
_, _, dist_m = geod.inv(
    ends["lon_start"].values,
    ends["lat_start"].values,
    ends["lon_end"].values,
    ends["lat_end"].values,
)
ends["dist_km"] = dist_m / 1000.0
# Drop vehicles whose net start->end displacement stays inside the radius.
ids_to_drop = ends.loc[ends["dist_km"] < HOME_RANGE_THRESHOLD].index
df_filtered = df_clean.loc[~df_clean[ID_COL].isin(ids_to_drop)].copy()
print(f"Dropped {len(ids_to_drop)} vehicles; remaining: {len(df_filtered[ID_COL].unique())}")
Dropped 13184 vehicles; remaining: 7373
Sampling¶
Tomamos una muestra aleatoria de los datos, utilizando el dominio del vehículo para filtrar.
PERCENT=0.10  # fraction of vehicles to keep in the sample
# Sample vehicle IDs (reproducible via random_state) and subset the frame.
vehicles = df_filtered[ID_COL].unique()
sampled_ids = pd.Series(vehicles).sample(frac=PERCENT, random_state=42)
df_sampled = df_filtered[df_filtered[ID_COL].isin(sampled_ids)].copy()
print(f"Total vehicles: {len(vehicles)} | Sampled: {len(sampled_ids)}")
Total vehicles: 7373 | Sampled: 737
Pre-Processing¶
En esta sección mejoramos/preparamos los datos para el análisis que vamos a realizar luego. No filtramos ni limpiamos sino que sacamos elementos que no serán necesarios para nuestro análisis.
# Parse timestamps as UTC, then drop tz info (naive UTC from here on).
df_sampled[TIME_COL] = pd.to_datetime(df_sampled[TIME_COL], utc=True).dt.tz_localize(None)
# Trim leading/trailing zero-speed runs per (vehicle, day) group.
df_sampled = df_sampled.sort_values([ID_COL, TIME_COL])
df_sampled["date"] = df_sampled[TIME_COL].dt.date
group_cols = [ID_COL, "date"]
value_cols = df_sampled.columns.difference(group_cols)
def trim_zero_edges(g):
    """Drop zero-speed rows at both ends of a group; a group with no
    movement at all is dropped entirely."""
    nz_idx = g.index[g[SPEED_COL].fillna(0) != 0]
    if nz_idx.empty:
        return g.iloc[0:0]  # no movement that day -> drop all rows
    first, last = nz_idx[0], nz_idx[-1]
    # NOTE(review): label-based slicing assumes the group's index labels are
    # unique (true for a default read_csv index) - confirm if the index ever
    # gets reset/duplicated upstream.
    return g.loc[first:last]
df_trimmed = (
    df_sampled.groupby(group_cols, group_keys=True)[value_cols]
    .apply(trim_zero_edges)
    .reset_index(level=[0, 1])  # bring group keys back as columns
    .reset_index(drop=True)
)
print(f"Trimmed {len(df_sampled) - len(df_trimmed)} rows")
Trimmed 65729 rows
# Convert to GeoPandas: points are read as EPSG:4326 (lon/lat) and projected
# to EPSG:3857 (Web Mercator - note: not UTM, despite similar units).
gdf = gpd.GeoDataFrame(
    df_trimmed,
    geometry=gpd.points_from_xy(df_trimmed[LON_COL], df_trimmed[LAT_COL], crs="EPSG:4326").to_crs("EPSG:3857"),
)
Inspect¶
Código auxiliar para inspeccionar datos.
Trajectory¶
DOMINIO="3UEGwOiggnWt"  # obfuscated vehicle token to inspect
g = gdf.loc[gdf[ID_COL] == DOMINIO, [ID_COL, TIME_COL, "geometry"]].copy()
if g.empty:
    raise ValueError(f"No se encontraron registros para ID_COL={DOMINIO}")
# movingpandas expects a time-indexed GeoDataFrame.
g = g.sort_values(TIME_COL).set_index(TIME_COL)
traj = mpd.Trajectory(g, traj_id=DOMINIO)
# Back to lon/lat for plotting over web tiles.
traj_gdf = traj.to_traj_gdf().to_crs(epsg=4326)
traj_gdf.hvplot(
    geo=True,
    tiles="CartoLight",
    rasterize=True,  # similar to datashade, hover on pixels
    line_width=1,
    alpha=0.9,
    #hover=False,  # skip tooltips for speed
    width=900,
    height=600,
)
Analysis¶
Esta sección contiene cada una de las lineas de análisis de nuestros datos.
Sampling Quality¶
¿Qué busco con Sampling Quality? Analizar, describir y desarrollar métricas orientadas a eliminar el ruido de los datos que voy a usar para futuros modelos. No evaluamos calidad de la observación generada (GPS o cinemómetro) sino cantidad, frecuencia temporal y/o espacial, y otros atributos que podamos derivar.
Run Statistics¶
# Build one trajectory per vehicle from the projected GeoDataFrame.
tc = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)
# Per-trajectory gap and speed stats (meters, m/s).
records = []
for traj in tc:
    # Prefer an explicit "timestamp" column; otherwise use the time index.
    time_series = (
        traj.df["timestamp"]
        if "timestamp" in traj.df.columns
        else traj.df.index.to_series()
    )
    time_series = pd.to_datetime(time_series)
    gaps = time_series.diff().dt.total_seconds()
    # add_distance/add_speed append "distance" and "speed" columns computed
    # in the gdf's CRS units (EPSG:3857 here) - presumably meters and m/s,
    # though Web Mercator distorts distances away from the equator; confirm.
    traj.add_distance()
    traj.add_speed()
    n_points = len(traj.df)
    coverage_s = (time_series.max() - time_series.min()).total_seconds()
    # Main per-trajectory measures.
    records.append({
        "id": traj.id,
        "n_points": n_points,
        "coverage_s" : coverage_s,
        "median_gap_s": gaps.median(),
        "p95_gap_s": gaps.quantile(0.95),
        "median_dist_m": traj.df["distance"].median(),
        "p95_dist_m": traj.df["distance"].quantile(0.95),
        "median_speed_mps": traj.df["speed"].median(),
        "p95_speed_mps": traj.df["speed"].quantile(0.95),
        "pct_speed_over_50kph": (traj.df["speed"] > 50/3.6).mean(),
    })
stats = pd.DataFrame(records)
# Basic density measures.
stats["coverage_hours"] = stats["coverage_s"] / 3600.0
stats["pts_per_hour"] = stats["n_points"] / stats["coverage_hours"]
# Expected points (coverage / median gap) and sampling efficiency.
stats["expected_points_median_gap"] = stats["coverage_s"] / stats["median_gap_s"]
stats["sampling_efficiency"] = stats["n_points"] / stats["expected_points_median_gap"]  # ~1 if sampling is consistent
# Consistency between implied speed (distance/gap) and reported speed.
stats["median_speed_kph"] = stats["median_speed_mps"] * 3.6
stats["p95_speed_kph"] = stats["p95_speed_mps"] * 3.6
stats["implied_median_speed_kph"] = (stats["median_dist_m"] / (stats["median_gap_s"])) * 3.6
stats["implied_p95_speed_kph"] = (stats["p95_dist_m"] / (stats["p95_gap_s"])) * 3.6
stats["delta_median_speed_kph"] = stats["implied_median_speed_kph"] - stats["median_speed_kph"]
stats["delta_p95_speed_kph"] = stats["implied_p95_speed_kph"] - stats["p95_speed_kph"]
# Tail / median sampling-gap ratio (irregularity indicator).
stats["gap_tail_ratio"] = stats["p95_gap_s"] / stats["median_gap_s"]
Plotting + Description¶
Ejecutamos algunas gráficas para visualizar la dispersión de los datos en base a algunas medidas propuestas.
# Left plot: coverage vs number of points, with a linear fit.
x = stats["coverage_hours"].to_numpy()
y = stats["n_points"].to_numpy()
# Ordinary least-squares line y = a*x + b.
a, b = np.polyfit(x, y, 1)
# Dense grid to draw the fitted line.
x_fit = np.linspace(x.min(), x.max(), 200)
y_fit = a * x_fit + b
# Coefficient of determination (R^2) of the linear fit.
y_pred = a * x + b
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2 = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.scatter(x, y, c="cadetblue",alpha=0.7)
ax1.plot(x_fit, y_fit, color="coral", linewidth=2)
ax1.set_yscale("log")  # log display only; the fit itself is linear in y
ax1.set_xlabel("coverage_hours")
ax1.set_ylabel("n_points (log)")
ax1.set_title("Coverage vs Num. of Points")
ax1.grid(True, alpha=0.3)
# NOTE(review): the annotation's log() wrapper describes the axis scale,
# not the fitted model (which is linear).
eq = f"log(y) = log({a:.4f}·x + {b:.4f})\nR² = {r2:.4f}"
ax1.text(
    0.025, 0.97, eq,
    transform=ax1.transAxes,
    va="top", ha="left",
    color="crimson",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
# Right plot: median gap vs point density, with mean reference lines.
mean_x = stats["median_gap_s"].mean()
mean_y = stats["pts_per_hour"].mean()
# Blended transforms: anchor text in data coords on one axis and axes
# (0-1) coords on the other.
tx = transforms.blended_transform_factory(ax2.transData, ax2.transAxes)  # x=data, y=axes
ty = transforms.blended_transform_factory(ax2.transAxes, ax2.transData)  # x=axes, y=data
ax2.scatter(stats["median_gap_s"], stats["pts_per_hour"], alpha=.7)
ax2.set_xscale("log"); ax2.set_yscale("log")
ax2.set_xlabel("median_gap_s (log)")
ax2.set_ylabel("pts_per_hour (log)")
ax2.axvline(mean_x, color="coral", linestyle=":", linewidth=2)
ax2.axhline(mean_y, color="coral", linestyle=":", linewidth=2)
ax2.set_title("Sampling Gap vs Point Density")
ax2.grid(True, alpha=.3)
ax2.text(
    mean_x, 0.98, f"μx = {mean_x:.2f}",
    transform=tx, rotation=90,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
ax2.text(
    0.98, mean_y, f"μy = {mean_y:.2f}",
    transform=ty,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
plt.tight_layout()
plt.show()
Coverage vs Num. of Points:
- Cobertura baja + pocos puntos => “vehículo con poca actividad / ventana corta”.
- Cobertura alta + pocos puntos => mal muestreo.
- Regresión y R^2 = Baja relación lineal entre cobertura y número de puntos.
- ¿Una flota uniforme debería ajustarse mejor?
Sampling Gap vs Point Density:
- ¿Qué es lo que buscamos? Outliers con median_gap bajo pero pts_per_hour bajo.
- Puede implicar gap mal estimado o timestamps raros.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Left plot: distribution of sampling_efficiency (ideal ~= 1).
x = stats["sampling_efficiency"]
# Clip the 1%-99% tails so extreme outliers don't stretch the histogram.
lo, hi = x.quantile([0.01, 0.99])
x_plot = x.clip(lo, hi)
ax1.hist(x_plot, bins=50, color="darkseagreen", alpha=0.8, rwidth=0.92)
ax1.axvline(1.0, linewidth=1, color="crimson")  # ideal efficiency
ax1.axvline(x.median(), linestyle="--", linewidth=2)
ax1.axvline(x.quantile(0.95), linestyle=":", linewidth=2)
ax1.set_xlabel("sampling_efficiency")
ax1.set_title("Sampling Efficiency")
ax1.grid(True, alpha=.25)
ax1.set_axisbelow(True)
ax1.text(
    0.02, 0.97,
    f"median={x.median():.2f}\nP95={x.quantile(0.95):.2f}\nP99={x.quantile(0.99):.2f}",
    transform=ax1.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)
# Right plot: gap tail ratio on a log10 scale (0 == perfectly regular).
g = stats["gap_tail_ratio"]
g_log = np.log10(g)
# Clip the 1%-99% tails for display.
lo, hi = g_log.quantile([0.01, 0.99])
g_plot = g_log.clip(lo, hi)
ax2.hist(g_plot, bins=40, color="mediumseagreen", alpha=0.8, rwidth=0.92)
ax2.axvline(0.0, linewidth=1, color="crimson")  # log10(1): p95 gap == median gap
ax2.axvline(g_log.median(), linestyle="--", linewidth=2)
ax2.axvline(g_log.quantile(0.95), linestyle=":", linewidth=2)
ax2.set_xlabel("log10(p95_gap / median_gap)")
ax2.set_title("Sample Irregularity (Tail ratio)")
ax2.grid(True, alpha=.25)
ax2.set_axisbelow(True)
ax2.text(
    0.83, 0.97,
    f"median={g_log.median():.2f}\nP95={g_log.quantile(0.95):.2f}\nP99={g_log.quantile(0.99):.2f}",
    transform=ax2.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)
plt.tight_layout()
plt.show()
Sampling Efficiency (Ideal 1)
- 1: el número de puntos esperados (tiempo/mediana de gap) es el número de puntos que tenemos.
- <1: faltan puntos respecto a lo que sugiere el gap.
- >1: duplicados, bursts, o median_gap subestima.
Sample Irregularity (Tail ratio) (Ideal 0)
- Percentil 95 de gaps / Mediana de gaps
- Ratio alto = Corte de la regularidad (¿faltan muestras?)
Outliers acá suelen ser saltos GPS o timestamps con problemas.
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
# NOTE(review): this rebinds `df` (previously the raw data) to a stats
# subset; from here on `df` means per-vehicle statistics.
df = stats[cols]
# Color points by log10(gap_tail_ratio), clipped at the 1%/99% quantiles.
c = np.log10(df["gap_tail_ratio"])
c_lo, c_hi = np.nanquantile(c, [0.01, 0.99])
c_plot = np.clip(c, c_lo, c_hi)
# Shared axis limits so the identity line sits at 45 degrees.
lim = np.nanmax([df["median_speed_kph"].max(), df["implied_median_speed_kph"].max()])
lims = (0, lim)
# Tolerance bands around the identity line.
tol1 = 0.10  # ±10%
tol2 = 0.25  # ±25%
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# =========================
# (1) Consistency: implied vs reported median speed
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)
# ±10% zone
y_low1 = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
# ±25% zone
y_low2 = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)
# Identity line y = x.
ax1.plot(x_band, x_band, color="0.6", linewidth=1)
sc1 = ax1.scatter(df["median_speed_kph"], df["implied_median_speed_kph"], c=c_plot, alpha=0.6, s=30)
ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.legend(loc="upper left", framealpha=0.9)
# =========================
# (2) Residuals: delta = implied - reported
# =========================
x_band2 = np.linspace(df["median_speed_kph"].min(), df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2, tol1*x_band2, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2, tol1*x_band2, tol2*x_band2, color="0.92", alpha=0.9)
ax2.axhline(0, color="0.6", linewidth=1)
sc2 = ax2.scatter(df["median_speed_kph"], df["delta_median_speed_kph"], c=c_plot, alpha=0.6, s=30)
ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.legend(loc="lower right", framealpha=0.9)
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("log10(gap_tail_ratio)")
fig.suptitle(
    "Speed Analysis (Color by gap_tail_ratio)",
    fontsize=14
)
plt.tight_layout()
plt.show()
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
df = stats[cols]
# Color points by sampling_efficiency on a diverging scale centered at the
# fleet mean (clipped into [vmin, vmax] as TwoSlopeNorm requires).
eff = df["sampling_efficiency"].to_numpy()
vmin = np.quantile(eff, 0.05)
vmax = np.quantile(eff, 0.99)
vcenter = np.mean(eff)
vcenter = np.clip(vcenter, vmin, vmax)
norm = mcolors.TwoSlopeNorm(vmin=vmin, vcenter=vcenter, vmax=vmax)
eff_clip = np.clip(eff, vmin, vmax)
# Tolerance bands around the identity line.
tol1 = 0.10  # ±10%
tol2 = 0.25  # ±25%
# Shared axis limits so the identity line sits at 45 degrees.
lim = np.nanmax([df["median_speed_kph"].max(), df["implied_median_speed_kph"].max()])
lims = (0, lim)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# =========================
# (1) Consistency: implied vs reported median speed
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)
# ±10% zone
y_low1 = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
# ±25% zone
y_low2 = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)
# Identity line y = x.
ax1.plot(x_band, x_band, color="0.6", linewidth=1)
sc1 = ax1.scatter(
    df["median_speed_kph"], df["implied_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)
ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.set_axisbelow(True)
ax1.legend(loc="lower right", framealpha=0.9)
# =========================
# (2) Residuals: delta = implied - reported
# =========================
x_band2 = np.linspace(df["median_speed_kph"].min(), df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2, tol1*x_band2, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2, tol1*x_band2, tol2*x_band2, color="0.92", alpha=0.9)
ax2.axhline(0, color="0.6", linewidth=1)
sc2 = ax2.scatter(
    df["median_speed_kph"], df["delta_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)
ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.set_axisbelow(True)
ax2.legend(loc="upper left", framealpha=0.9)
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("sampling_efficiency")
cb2.set_ticks([vmin, vcenter, vmax])
fig.suptitle(
    "Speed Analysis (Color by sampling_efficiency)",
    fontsize=14
)
plt.tight_layout()
plt.show()
Algunas cosas que observamos:
- Concentración cerca de la diagonal (zona ±10%), especialmente en el rango medio–alto de velocidades.
- La velocidad implícita suele subestimar la reportada, y el efecto se acentúa a velocidades mayores.
- El error crece con la velocidad.
- gap_tail_ratio se alinea bien con los problemas de consistencia (la velocidad reportada es menor que la implicita)
- sampling_efficiency evidencia que hay menos puntos que los esperables en todo el rango de velocidades.
Algunas cosas para investigar a futuro:
- Segmentación de recorridos.
- Detección de paradas.
- Generalizaciones para zonas sin datos.
cols = ["id","median_speed_kph","implied_median_speed_kph","gap_tail_ratio","sampling_efficiency","n_points","coverage_hours","delta_median_speed_kph"]
df = stats[cols]
# "Out of band" definition.
# Below this reported speed (kph) relative error is noisy, so exclude.
MIN_SPEED = 10
df_eval = df[df["median_speed_kph"] >= MIN_SPEED].copy()
df_eval["abs_delta_kph"] = df_eval["delta_median_speed_kph"].abs()
df_eval["abs_rel_err"] = df_eval["abs_delta_kph"] / df_eval["median_speed_kph"].clip(lower=1e-6)
# Tolerance thresholds used to flag out-of-band vehicles.
tol10, tol25 = 0.10, 0.25
df_eval["out_10pct"] = df_eval["abs_delta_kph"] > (tol10 * df_eval["median_speed_kph"])
df_eval["out_25pct"] = df_eval["abs_delta_kph"] > (tol25 * df_eval["median_speed_kph"])
# Speed bins (kph), left-closed via right=False.
bins = [MIN_SPEED, 20, 40, 60, 80, 100, 120, 160, np.inf]
labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
df_eval["speed_bin"] = pd.cut(df_eval["median_speed_kph"], bins=bins, labels=labels, right=False)
summary = (
    df_eval.groupby("speed_bin", observed=False)
    .agg(
        n=("id","count"),
        out_10pct_rate=("out_10pct", "mean"),
        out_25pct_rate=("out_25pct", "mean"),
        median_abs_delta=("abs_delta_kph","median"),
        p95_abs_delta=("abs_delta_kph", lambda s: s.quantile(0.95)),
        median_gap_tail_ratio=("gap_tail_ratio","median"),
        median_sampling_eff=("sampling_efficiency","median"),
    )
    .reset_index()
)
# Round rates for display.
summary["out_10pct_rate"] = (summary["out_10pct_rate"]).round(4)
summary["out_25pct_rate"] = (summary["out_25pct_rate"]).round(4)
summary
| speed_bin | n | out_10pct_rate | out_25pct_rate | median_abs_delta | p95_abs_delta | median_gap_tail_ratio | median_sampling_eff | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10-20 | 66 | 0.9091 | 0.8636 | 9.917205 | 14.965177 | 4.730206 | 0.641597 |
| 1 | 20-40 | 112 | 0.8929 | 0.8214 | 15.579568 | 27.303946 | 4.078311 | 0.576172 |
| 2 | 40-60 | 80 | 0.8000 | 0.6000 | 16.188403 | 35.582364 | 3.457812 | 0.554763 |
| 3 | 60-80 | 103 | 0.6699 | 0.5049 | 17.111441 | 53.443365 | 2.542373 | 0.667343 |
| 4 | 80-100 | 202 | 0.3465 | 0.2277 | 2.701474 | 59.466536 | 1.896023 | 0.729820 |
| 5 | 100-120 | 48 | 0.3125 | 0.2292 | 1.865364 | 60.945672 | 1.355311 | 0.699619 |
| 6 | 120-160 | 5 | 0.6000 | 0.2000 | 12.268467 | 33.681922 | 1.016667 | 0.721329 |
| 7 | 160-inf | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
Scoring¶
Seleccionamos Top N vehículos en función de una métrica de interés para empezar a entender nuestros outliers.
SCORING_BY = "abs_rel_err"  # metric used to rank outlier candidates
# Top-20 vehicles by the chosen score, highest first.
top = (
    df_eval.assign(score=df_eval[SCORING_BY])
    .sort_values("score", ascending=False)
    .head(20)
    .copy()
)
cols_show = [
    "id",
    "median_speed_kph",
    "implied_median_speed_kph",
    "delta_median_speed_kph",
    "abs_delta_kph",
    "abs_rel_err",
    "gap_tail_ratio",
    "sampling_efficiency",
    "n_points",
    "coverage_hours",
]
# Round for display and rename the score column to make its meaning explicit.
top[cols_show].assign(
    abs_rel_err=lambda d: (d["abs_rel_err"]).round(3),  # relative error rate
    delta_median_speed_kph=lambda d: d["delta_median_speed_kph"].round(1),
    abs_delta_kph=lambda d: d["abs_delta_kph"].round(1),
    median_speed_kph=lambda d: d["median_speed_kph"].round(1),
    implied_median_speed_kph=lambda d: d["implied_median_speed_kph"].round(1),
    gap_tail_ratio=lambda d: d["gap_tail_ratio"].round(2),
    sampling_efficiency=lambda d: d["sampling_efficiency"].round(2),
    coverage_hours=lambda d: d["coverage_hours"].round(2),
).rename(columns={"abs_rel_err":"abs_rel_err_rate"})
| id | median_speed_kph | implied_median_speed_kph | delta_median_speed_kph | abs_delta_kph | abs_rel_err_rate | gap_tail_ratio | sampling_efficiency | n_points | coverage_hours | |
|---|---|---|---|---|---|---|---|---|---|---|
| 333 | 3UEGwOiggnWt | 20.6 | 43.0 | 22.4 | 22.4 | 1.088 | 34.00 | 0.05 | 3652 | 20.39 |
| 666 | Y1qBOymx8YGb | 28.9 | 59.3 | 30.4 | 30.4 | 1.052 | 30.00 | 0.06 | 2143 | 10.67 |
| 425 | 4MB0oH5pe80M | 23.7 | 1.0 | -22.7 | 22.7 | 0.957 | 6.58 | 0.56 | 164 | 14.62 |
| 591 | 5oPliRq1UI1v | 12.6 | 0.6 | -12.0 | 12.0 | 0.952 | 2.29 | 1.07 | 783 | 21.28 |
| 188 | 2S4buidKVaeo | 10.4 | 0.5 | -9.9 | 9.9 | 0.950 | 2.52 | 1.05 | 501 | 15.98 |
| 199 | 2UxsOtbEaYvc | 10.4 | 0.5 | -9.8 | 9.8 | 0.947 | 2.67 | 1.01 | 434 | 13.55 |
| 198 | 2UNu3nAvubkL | 20.0 | 1.1 | -18.9 | 18.9 | 0.944 | 3.22 | 0.90 | 395 | 11.40 |
| 576 | 5edKs4Tl9QiP | 13.4 | 0.8 | -12.5 | 12.5 | 0.938 | 2.66 | 0.99 | 372 | 11.79 |
| 95 | 1lGQ1tCRHXrn | 11.0 | 0.7 | -10.3 | 10.3 | 0.933 | 5.01 | 0.56 | 261 | 23.42 |
| 46 | 1Kyg7cHK3Mpi | 12.5 | 0.9 | -11.6 | 11.6 | 0.930 | 5.36 | 0.66 | 294 | 20.90 |
| 73 | 1YoE4zWDpLth | 20.8 | 1.5 | -19.4 | 19.4 | 0.929 | 2.77 | 1.14 | 805 | 21.23 |
| 91 | 1kENfr6FX4UD | 32.4 | 2.7 | -29.6 | 29.6 | 0.915 | 2.96 | 0.76 | 1017 | 14.39 |
| 109 | 1qBbPK8P73tB | 23.4 | 2.0 | -21.4 | 21.4 | 0.913 | 4.97 | 0.68 | 287 | 21.14 |
| 343 | 3ZZrHcZzX2eI | 19.5 | 1.9 | -17.7 | 17.7 | 0.904 | 2.34 | 0.93 | 624 | 23.97 |
| 495 | 4zQiEZ6eO8jr | 10.5 | 1.0 | -9.4 | 9.4 | 0.902 | 4.95 | 0.65 | 799 | 20.95 |
| 391 | 43m3itVV18e6 | 26.7 | 2.6 | -24.1 | 24.1 | 0.901 | 1.02 | 1.52 | 655 | 14.19 |
| 471 | 4kiueP9NP0EV | 17.0 | 1.7 | -15.3 | 15.3 | 0.901 | 1.07 | 1.15 | 1457 | 21.12 |
| 724 | xkvxf4FjNsv9 | 14.2 | 1.4 | -12.7 | 12.7 | 0.900 | 4.56 | 0.65 | 542 | 15.38 |
| 631 | GZcaYHMAWfYq | 35.5 | 3.6 | -31.9 | 31.9 | 0.898 | 2.70 | 0.93 | 676 | 17.95 |
| 256 | 2wOOQjytBs5k | 60.7 | 6.3 | -54.4 | 54.4 | 0.896 | 1.22 | 1.34 | 2035 | 21.17 |
Proponemos otra estrategia de detección de outliers mediante Outlier Scoring. Para ello vamos a utilizar una ponderación de las siguientes medidas:
Sampling Score
- Promedio de tres señales robustas.
- Mide anomalía de muestreo (cortes + irregularidad + eficiencia).
- Severidad de cortes: p95_gap_s alto (en log)
- Intermitencia: gap_tail_ratio = p95_gap_s / median_gap_s alto (en log)
- Eficiencia de muestreo: sampling_efficiency lejos del valor típico (ideal ≈ 1, pero se mide como outlier robusto)
GPS Score
- Mide anomalía espacial compatible con saltos GPS, errores de geolocalización o puntos espurios.
- Severidad robusta del tamaño de paso extremo usando p95_dist_m en log.
- Valores altos indican que el vehículo presenta distancias entre puntos anormalmente grandes en el percentil alto, lo cual suele asociarse a saltos de posición, mala señal o glitches.
Consistency score
- Severidad robusta del error típico.
- Mide la inconsistencia entre velocidad reportada y velocidad implícita calculada.
- Valores altos pueden indicar complejidad para modelar el movimiento (error de distancias, muestreo irregular).
Nota: Usamos el estadístico z_robusto aplicado a las medidas, dicha herramienta es frecuentemente utilizada como método de detección de outliers en textos de EDA.
To detect univariate outliers robustly, we used the modified z‑score based on the median absolute deviation (MAD). Following standard practice in robust statistics (Iglewicz & Hoaglin), observations with |M_i| > 3.5 were considered potential outliers. The constant 0.6745 scales MAD to be consistent with the standard deviation under normality.
def robust_z(x: pd.Series) -> pd.Series:
    """Modified z-score (Iglewicz & Hoaglin): 0.6745 * (x - median) / MAD.

    A tiny epsilon keeps the denominator non-zero when the MAD collapses
    to 0 (constant series). NaNs are ignored by the median estimates.
    """
    values = x.astype(float)
    center = np.nanmedian(values)
    spread = np.nanmedian(np.abs(values - center)) + 1e-9
    return 0.6745 * (values - center) / spread
def safe_log10(s: pd.Series) -> pd.Series:
    """log10 that maps non-positive values to NaN instead of warning/raising."""
    as_float = s.astype(float)
    positive_only = as_float.where(as_float > 0)
    return np.log10(positive_only)
def clip_abs(s: pd.Series, cap: float = 6.0) -> pd.Series:
    """Return |s| capped at *cap* so a single feature cannot dominate a score.

    NaNs propagate unchanged, matching ``Series.clip`` semantics.
    """
    return np.minimum(s.abs(), cap)
# -------------------------
# Ensure required columns
# -------------------------
# NOTE(review): `stats` is the per-vehicle summary table built in an earlier
# cell — this cell only consumes the columns listed below; confirm upstream.
required = ["id","p95_gap_s","gap_tail_ratio","p95_dist_m","sampling_efficiency","n_points","coverage_hours",
"median_speed_kph","implied_median_speed_kph", "delta_median_speed_kph"]
df = stats[required].copy()
# Magnitude of the reported-vs-implied median speed gap (sign is irrelevant).
df["abs_delta_median_speed_kph"] = df["delta_median_speed_kph"].abs()
# -------------------------
# Robust z-scores
# -------------------------
z = pd.DataFrame(index=df.index)
# Sampling irregularity / gaps (log scale tames the heavy right tail)
z["z_log_p95_gap_s"] = robust_z(safe_log10(df["p95_gap_s"]))
z["z_log_gap_tail_ratio"] = robust_z(safe_log10(df["gap_tail_ratio"]))
# Sampling efficiency (no log; we look for extremes relative to the fleet)
z["z_sampling_eff"] = robust_z(df["sampling_efficiency"])
# GPS jumps proxy (log dist)
z["z_log_p95_dist_m"] = robust_z(safe_log10(df["p95_dist_m"]))
# Consistency (error between reported and implied speed)
z["z_abs_delta_speed"] = robust_z(df["abs_delta_median_speed_kph"])
# -------------------------
# Component scores (cap + average)
# -------------------------
CAP = 6.0
# Mean of three capped |z| signals: gap severity, intermittency, efficiency.
sampling_score = (
clip_abs(z["z_log_p95_gap_s"], CAP) +
clip_abs(z["z_log_gap_tail_ratio"], CAP) +
clip_abs(z["z_sampling_eff"], CAP)
) / 3.0
gps_score = clip_abs(z["z_log_p95_dist_m"], CAP)
consistency_score = clip_abs(z["z_abs_delta_speed"], CAP)
# -------------------------
# Reliability weight (avoid over-reacting to vehicles with little data)
# -------------------------
# Tune these thresholds to your case (reasonable for one day of telemetry)
MIN_POINTS_FOR_FULL = 200
MIN_COV_H_FOR_FULL = 4.0
# Log-scaled point-count weight in [0, 1]; lower clip at 2 avoids log10(0/1)=<=0.
w_points = np.clip(np.log10(df["n_points"].clip(lower=2)) / np.log10(MIN_POINTS_FOR_FULL), 0, 1)
w_cov = np.clip(df["coverage_hours"] / MIN_COV_H_FOR_FULL, 0, 1)
reliability_w = (w_points * w_cov).fillna(0)
# -------------------------
# Total outlier score (per-family weights)
# -------------------------
W_SAMPLING = 1.0
W_GPS = 1.0
W_CONS = 1.5 # usually the most "serious" signal for this consistency diagnosis
raw_score = (W_SAMPLING * sampling_score) + (W_GPS * gps_score) + (W_CONS * consistency_score)
outlier_score = raw_score * reliability_w
# -------------------------
# Output table + flags
# -------------------------
# One row per vehicle: component/total scores plus the raw metrics behind them.
out = pd.DataFrame({
"id": df["id"],
"sampling_score": sampling_score,
"gps_score": gps_score,
"consistency_score": consistency_score,
"raw_score": raw_score,
"reliability_w": reliability_w,
"outlier_score": outlier_score,
"sampling_efficiency": df["sampling_efficiency"],
"gap_tail_ratio": df["gap_tail_ratio"],
"p95_gap_s": df["p95_gap_s"],
"p95_dist_m": df["p95_dist_m"],
"abs_delta_median_speed_kph": df["abs_delta_median_speed_kph"],
"n_points": df["n_points"],
"coverage_hours": df["coverage_hours"],
})
# Flags: MAD-z beyond 3.5 (practical rule from Iglewicz & Hoaglin)
TH = 3.5
out["flag_sampling"] = (z["z_log_p95_gap_s"].abs() > TH) | (z["z_log_gap_tail_ratio"].abs() > TH) | (z["z_sampling_eff"].abs() > TH)
out["flag_gps"] = (z["z_log_p95_dist_m"].abs() > TH)
out["flag_consistency"] = (z["z_abs_delta_speed"].abs() > TH)
# Top N outliers
out.sort_values("outlier_score", ascending=False).head(25)
| id | sampling_score | gps_score | consistency_score | raw_score | reliability_w | outlier_score | sampling_efficiency | gap_tail_ratio | p95_gap_s | p95_dist_m | abs_delta_median_speed_kph | n_points | coverage_hours | flag_sampling | flag_gps | flag_consistency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 181 | 2Nrtauu0kF4Y | 0.466861 | 1.277099 | 6.000000 | 10.743960 | 1.0 | 10.743960 | 0.701068 | 4.448529 | 605.00 | 6200.494278 | 72.287795 | 392 | 21.123333 | False | False | True |
| 688 | jAESbgsgb8vp | 1.503399 | 0.206251 | 6.000000 | 10.709650 | 1.0 | 10.709650 | 0.200353 | 40.000000 | 600.00 | 1626.077933 | 74.032635 | 931 | 19.361667 | False | False | True |
| 618 | 6n6r55ztR50L | 0.289181 | 1.127169 | 6.000000 | 10.416350 | 1.0 | 10.416350 | 0.745604 | 3.703704 | 300.00 | 5415.927354 | 72.533799 | 702 | 21.184167 | False | False | True |
| 376 | 3wsd1LVYPxrr | 0.575109 | 0.664936 | 6.000000 | 10.240045 | 1.0 | 10.240045 | 0.616529 | 1.875000 | 30.00 | 1074.967276 | 73.062759 | 1367 | 9.854444 | False | False | True |
| 372 | 3qfQAgdkd10D | 0.652426 | 0.303406 | 6.000000 | 9.955832 | 1.0 | 9.955832 | 0.356332 | 4.991667 | 89.85 | 1489.598119 | 76.256423 | 1533 | 21.510833 | False | False | True |
| 566 | 5ZSBX8WIm5fg | 0.496187 | 1.031908 | 5.536344 | 9.832612 | 1.0 | 9.832612 | 1.037190 | 2.281369 | 300.00 | 4969.841005 | 65.407326 | 581 | 20.461667 | False | False | True |
| 160 | 2CwyANdzkUrA | 0.614027 | 0.122079 | 6.000000 | 9.736106 | 1.0 | 9.736106 | 0.954160 | 1.848485 | 61.00 | 2186.780331 | 73.540335 | 890 | 8.550278 | False | False | True |
| 403 | 49G3rMQ9HZGo | 0.380000 | 0.627262 | 5.802354 | 9.710793 | 1.0 | 9.710793 | 0.776722 | 4.615385 | 300.00 | 3449.632403 | 68.191449 | 816 | 18.968611 | False | False | True |
| 144 | 26JvjyZhrZHq | 0.358238 | 0.971186 | 5.360266 | 9.369823 | 1.0 | 9.369823 | 0.918966 | 2.510460 | 300.00 | 4704.867883 | 63.564446 | 587 | 21.203333 | False | False | True |
| 200 | 2VQjbvCWyNSQ | 0.253590 | 0.860352 | 5.378720 | 9.182022 | 1.0 | 9.182022 | 0.595578 | 3.485714 | 122.00 | 4257.107601 | 63.757589 | 698 | 11.394167 | False | False | True |
| 665 | Xejt5XDWB68Z | 0.842640 | 0.069291 | 5.454461 | 9.093623 | 1.0 | 9.093623 | 0.284173 | 5.545455 | 61.00 | 1839.977427 | 64.550316 | 1209 | 12.999722 | False | False | True |
| 467 | 4iNWUPtXU3IH | 0.469167 | 0.942165 | 5.108229 | 9.073675 | 1.0 | 9.073675 | 1.000646 | 2.222222 | 300.00 | 4583.262428 | 60.926558 | 436 | 16.339444 | False | False | True |
| 272 | 34P0C6N0jwsv | 0.487800 | 0.092410 | 5.608647 | 8.993181 | 1.0 | 8.993181 | 0.482971 | 3.000000 | 60.00 | 1801.991926 | 66.164067 | 1801 | 20.716667 | False | False | True |
| 143 | 267wFnyIxqAn | 0.997715 | 1.251219 | 4.470444 | 8.954600 | 1.0 | 8.954600 | 0.910156 | 1.052632 | 20.00 | 633.354666 | 54.251332 | 2539 | 14.723056 | False | False | True |
| 56 | 1NodC6MiUG6A | 0.292117 | 1.111426 | 4.974204 | 8.864849 | 1.0 | 8.864849 | 0.776739 | 3.370787 | 300.00 | 5339.538726 | 59.523816 | 490 | 15.595833 | False | False | True |
| 588 | 5lV9qKye5C2I | 0.310671 | 0.677006 | 5.023556 | 8.523011 | 1.0 | 8.523011 | 0.846620 | 2.839623 | 301.00 | 3607.998150 | 60.040344 | 589 | 20.484722 | False | False | True |
| 599 | 5t7JFDNbMKgT | 0.388641 | 0.955166 | 4.725812 | 8.432525 | 1.0 | 8.432525 | 0.955052 | 2.542373 | 300.00 | 4637.344983 | 56.924079 | 327 | 11.222778 | False | False | True |
| 482 | 4q4ctvIRr0ZR | 0.687679 | 0.760989 | 4.598004 | 8.345674 | 1.0 | 8.345674 | 0.470645 | 9.709677 | 301.00 | 3892.035737 | 55.586408 | 1158 | 21.187222 | False | False | True |
| 207 | 2bICz2yS4YQz | 0.552059 | 0.274787 | 4.864748 | 8.123968 | 1.0 | 8.123968 | 0.526253 | 7.228916 | 300.00 | 2509.848149 | 58.378217 | 1095 | 23.986389 | False | False | True |
| 131 | 223uDa5Ke2hV | 0.672898 | 0.802181 | 4.354180 | 8.006350 | 1.0 | 8.006350 | 0.539201 | 1.714286 | 30.00 | 949.757311 | 53.034485 | 2367 | 21.339444 | False | False | True |
| 361 | 3hmcNb4X0gGJ | 0.611792 | 0.440490 | 4.633551 | 8.002609 | 1.0 | 8.002609 | 1.186349 | 2.666667 | 300.00 | 2914.620493 | 55.958453 | 549 | 14.461389 | False | False | True |
| 256 | 2wOOQjytBs5k | 1.088432 | 0.165606 | 4.480428 | 7.974679 | 1.0 | 7.974679 | 1.335022 | 1.220000 | 61.00 | 1686.822249 | 54.355826 | 2035 | 21.171111 | False | False | True |
| 356 | 3eeAlSDySqwp | 1.218776 | 0.285790 | 4.200961 | 7.806008 | 1.0 | 7.806008 | 0.272422 | 26.353333 | 395.30 | 1513.464613 | 51.430856 | 875 | 13.383056 | False | False | True |
| 108 | 1pXAjoLACCV8 | 0.531891 | 0.581032 | 4.373368 | 7.672975 | 1.0 | 7.672975 | 0.520330 | 6.593407 | 300.00 | 3308.692878 | 53.235306 | 821 | 19.942222 | False | False | True |
| 536 | 5KMSlf784G2M | 0.142101 | 0.671994 | 4.559414 | 7.653215 | 1.0 | 7.653215 | 0.667057 | 3.440476 | 144.50 | 3591.714967 | 55.182515 | 760 | 13.292222 | False | False | True |
Cinematics¶
Trajectory¶
# Build one trajectory per vehicle id from the point GeoDataFrame.
# NOTE(review): `gdf` comes from an earlier cell — presumably the raw points
# with a geometry column and parsed timestamps; confirm upstream.
traj_collection = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)
# Drop trajectories whose summed path length is implausibly long
# (position jumps inflate the point-to-point distance total).
MAX_LEN_KM = 2_000
filtered = []
for traj in traj_collection:
    if traj.get_length() / 1000 <= MAX_LEN_KM:
        filtered.append(traj)
traj_collection_filtered = mpd.TrajectoryCollection(filtered)
print(f"Kept {len(traj_collection_filtered)} of {len(traj_collection)} trajectories")
Kept 723 of 728 trajectories
Distance histogram¶
# Per-trajectory length in km for the cleaned collection.
traj_lengths_km = pd.Series(
    [traj.get_length() / 1000 for traj in traj_collection_filtered.trajectories],
    name="trajectory_length_km",
)
# Fixed-width bins from 0 up to the next multiple of bin_width above the max.
bin_width = 100
max_len = np.ceil(traj_lengths_km.max() / bin_width) * bin_width
bins = np.arange(0, max_len + bin_width, bin_width)
bin_centers = 0.5 * (bins[1:] + bins[:-1])
# Histogram counts computed ONCE and reused for both the bars and the
# cumulative curve (the original ran np.histogram and ax.hist on the same
# data/bins, doing the binning twice).
counts_raw, _ = np.histogram(traj_lengths_km, bins=bins)
cumulative = counts_raw.cumsum() / counts_raw.sum()
fig, ax = plt.subplots(figsize=(10, 4))
# Cumulative share on the left axis.
ax.plot(bin_centers, cumulative, color="C1", marker="o", ms=3, label="Cumulative share")
ax.set_ylabel("Cumulative share")
ax.set_ylim(0, 1.15)
ax.yaxis.grid(True, linestyle="--", alpha=0.6)
ax.set_title("Trajectory lengths")
# The original set label= but never drew a legend, so the label was dead.
ax.legend(loc="center right")
# Histogram bars on the right axis; same geometry as ax.hist (bars centered
# on the bins, full bin width). Counts axis ticks are hidden — the bars are
# annotated with their counts instead.
ax_count = ax.twinx()
patches = ax_count.bar(
    bin_centers, counts_raw, width=bin_width, color="C0", alpha=0.6, edgecolor="white"
)
ax_count.tick_params(labelright=False, right=False)
ax_count.grid(False)
# Annotate each non-empty bin with its count.
for rect, count in zip(patches, counts_raw):
    if count == 0:
        continue
    x = rect.get_x() + rect.get_width() / 2
    y = rect.get_height()
    ax_count.text(x, y, f"{int(count)}", ha="center", va="bottom", fontsize=8, color="black")
# Annotate each cumulative point (same color as the line).
for x, y in zip(bin_centers, cumulative):
    ax.text(x, y + .02, f"{y:.2f}", ha="center", va="bottom", fontsize=8, color="C1")
# X labels: upper end of each interval, centered on the bar.
ax.set_xlabel("Bin upper end (km)")
ax.set_xticks(bin_centers)
ax.set_xticklabels([f"{edge:.0f}" for edge in bins[1:]], rotation=45, ha="right")
plt.tight_layout()
plt.show()
Long Distance Trajectories Map¶
# Keep only long-haul trips (at least MIN_LEN_KM of travelled distance)
# for the density map below.
MIN_LEN_KM = 50
filtered = []
for traj in traj_collection_filtered:
    if traj.get_length() / 1000 >= MIN_LEN_KM:
        filtered.append(traj)
traj_collection_longdistance = mpd.TrajectoryCollection(filtered)
print(f"Kept {len(traj_collection_longdistance)} of {len(traj_collection_filtered)} trajectories")
Kept 722 of 723 trajectories
# Project to WGS84 so the trajectories line up with the web map tiles.
traj_gdf = traj_collection_longdistance.to_traj_gdf().to_crs(epsg=4326)
plot_opts = dict(
    geo=True,
    tiles="CartoLight",
    line_width=1,
    alpha=0.9,
    datashade=True,      # rasterize + shade by density (hundreds of lines)
    aggregator='count',  # each pixel counts overlapping segments
    cmap='hot',
    hover=False,         # tooltips disabled for speed
    width=900,
    height=600,
)
traj_gdf.hvplot(**plot_opts)