#!/bin/bash
set -euo pipefail

# =============================================================
#  BTRFS Backup Script — rsync to TYH-BACKUP external NTFS drive
# =============================================================

# Config
# Script log, rsync's own transfer log, and the file used for the flock guard.
log_file="/home/rogeryu/btrfs_backup.log"
rsync_log_file="/home/rogeryu/btrfs_backup_rsync.log"
lock_file="/var/lock/btrfs-backup.lock"
# Every run writes into its own timestamped directory (YYYY-MM-DD_HHMM).
backup_base="/media/rogeryu/TYH-BACKUP/btrfs-backup"
current_backup="$backup_base/$(date +%F_%H%M)"
# Free space (GB) that must be available on the backup drive after pruning.
min_free_gb=50
# Telegram credentials. TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID from the
# environment take precedence; the literals are only fallbacks so existing
# setups keep working.
# SECURITY NOTE(review): a bot token committed in a script should be treated
# as leaked — rotate it and supply the new one via the environment.
telegram_token="${TELEGRAM_BOT_TOKEN:-8631671300:AAG3N_TxnjK113ULNidvLr6ebeg6T1PKt0E}"
telegram_chat_id="${TELEGRAM_CHAT_ID:-8325862820}"

# Source directories. The trailing slash makes rsync copy the directory's
# contents into the destination rather than nesting the directory itself.
files_to_backup=(
    "/mnt/btrfs-raid/media/"
    "/mnt/btrfs-raid/immich-uploads/"
    "/mnt/btrfs-raid/OL-Drive/"
)

# Send a Markdown-formatted message to the configured Telegram chat.
# Globals:   telegram_token, telegram_chat_id (read)
# Arguments: $1 - message text
# Returns:   always 0 — notification is best-effort and must never abort
#            the backup (the trailing `|| true` is deliberate).
notify_telegram() {
    local message="$1"
    # --data-urlencode: the text may contain '&', '+', '%' or '=' which would
    #   otherwise corrupt the form-encoded request body.
    # --connect-timeout / --max-time: bound the call so an unreachable network
    #   cannot hang the script (this function also runs from the EXIT trap).
    curl -s -X POST \
        --connect-timeout 10 --max-time 30 \
        "https://api.telegram.org/bot${telegram_token}/sendMessage" \
        --data-urlencode "chat_id=${telegram_chat_id}" \
        --data-urlencode "text=${message}" \
        --data-urlencode "parse_mode=Markdown" > /dev/null 2>&1 || true
}

# Append one line of the form "[YYYY-MM-DD HH:MM:SS] <message>" to $log_file.
# Arguments: $1 - message text
log() {
    printf '[%s] %s\n' "$(date '+%F %H:%M:%S')" "$1" >> "$log_file"
}

# ---- Rotate logs if they get too big (keep last 1000 lines) ----
# Arguments: $1 - path of the log file
# A regular file larger than 10485760 bytes (10 MiB) is cut down to its final
# 1000 lines via a "<file>.tmp" sibling; anything else is left untouched.
rotate_log() {
    local target="$1"
    local size_bytes

    [[ -f "$target" ]] || return 0

    size_bytes=$(wc -c < "$target" 2>/dev/null || echo 0)
    [ "$size_bytes" -gt 10485760 ] || return 0

    tail -n 1000 "$target" > "${target}.tmp" && mv "${target}.tmp" "$target"
}

for managed_log in "$log_file" "$rsync_log_file"; do
    rotate_log "$managed_log"
done

# ---- Stale lock cleanup ----
# A lock file that merely exists is harmless; only a lock that is actually
# HELD matters. If it is held, the PID recorded in the file decides:
#   - PID alive, or no PID recorded yet  -> a real instance is running: abort.
#   - PID recorded but that process is gone -> the lock is held by something
#     other than a live backup run (for instance a leftover child process that
#     inherited the descriptor): drop the file and start over on a fresh one.
if [ -f "$lock_file" ] && ! flock -n "$lock_file" true 2>/dev/null; then
    lock_pid=$(cat "$lock_file" 2>/dev/null || echo "")
    # An empty file means the holder has taken the lock but has not written
    # its PID yet. That is an instance starting up, NOT a stale lock; treating
    # it as stale would let two backups run side by side.
    if [ -z "$lock_pid" ] || kill -0 "$lock_pid" 2>/dev/null; then
        log "FATAL: Another backup instance is already running (pid ${lock_pid:-unknown}). Aborting."
        notify_telegram "🔴 *Backup Aborted* — Another instance is already running (pid ${lock_pid:-unknown})"
        exit 1
    fi
    log "WARN: Stale lock file found (pid $lock_pid not alive). Cleaning up."
    rm -f "$lock_file"
fi

# ---- Concurrency guard (flock) ----
# Open in append mode: a truncating open (">") would wipe the PID written by
# whoever currently holds the lock before we even know whether we can have it.
exec 200>>"$lock_file"
flock -n 200 || {
    log "FATAL: Another backup instance is already running (lock held). Aborting."
    notify_telegram "🔴 *Backup Aborted* — Another instance is already running"
    exit 1
}
# We own the lock now; replace the file's contents with our PID. Writing by
# path touches the same inode fd 200 refers to, so the lock is unaffected.
echo $$ > "$lock_file"

# ---- Trap: ensure notification fires even on unexpected exit ----
# Two flags describe how far the run got: started_backup is switched on just
# before the transfers begin, completed_normally when the final report is
# reached. Leaving the script between those two points is an interruption.
completed_normally=false
started_backup=false

# EXIT handler: logs and notifies only for an interrupted run; silent otherwise.
cleanup() {
    [[ "$started_backup" == true && "$completed_normally" == false ]] || return 0
    log "Backup was interrupted at $(date)"
    notify_telegram "⚠️ *Backup was interrupted* — check \`$log_file\`"
}

# Convert TERM/INT/HUP into a regular exit so the EXIT handler above runs.
trap 'exit 1' TERM INT HUP
trap cleanup EXIT

log "=== Backup run started ==="

# ---- Pre-flight checks ----

# 1. Refuse to run unless TYH-BACKUP is a mounted filesystem
mountpoint -q /media/rogeryu/TYH-BACKUP || {
    log "FATAL: TYH-BACKUP is not mounted. Aborting."
    notify_telegram "🔴 *Backup Failed* — TYH-BACKUP drive is not mounted"
    exit 1
}

# 2. Every configured source must be an existing directory; stop at the first
#    one that is not.
for source_dir in "${files_to_backup[@]}"; do
    [[ -d "$source_dir" ]] && continue
    log "FATAL: Source directory $source_dir does not exist. Aborting."
    notify_telegram "🔴 *Backup Failed* — Source directory \`$source_dir\` not found"
    exit 1
done

# 3. CLEANUP FIRST! Free up space before checking.
#    Snapshots are pruned aggressively: only the newest existing snapshot
#    directory (newest by name, i.e. by date stamp) is kept, everything older
#    is deleted. The author's note says --link-dest does not produce real
#    hardlinks on this NTFS drive, so every snapshot is a full copy.
#    prev_snapshot is only logged in this section; run_rsync does not pass
#    --link-dest.
#    NOTE(review): "newest" is decided purely by directory name, so a
#    directory left behind by a failed run would be kept in preference to an
#    older complete one — confirm this is acceptable.
log "Pruning old snapshots (keeping at most 1 previous)..."
prev_snapshot=""
snapshots=()
while IFS= read -r d; do
    snapshots+=("$d")
done < <(find "$backup_base" -maxdepth 1 -type d -name '????-??-??_*' 2>/dev/null | sort -r)

if [ ${#snapshots[@]} -ge 1 ]; then
    # Reverse-sorted, so index 0 is the most recent; delete the rest (if any)
    prev_snapshot="${snapshots[0]}"
    for ((i = 1; i < ${#snapshots[@]}; i++)); do
        snap="${snapshots[$i]}"
        snap_name="${snap##*/}"
        log "Removing old snapshot: $snap_name"
        # NTFS-3G safe deletion (rm -rf fails on deep directory trees):
        # depth-first find first, then files / empty dirs / rm -rf as fallback.
        find "$snap" -depth ! -name . -delete 2>/dev/null || \
          (find "$snap" -type f -delete 2>/dev/null; find "$snap" -depth -type d -empty -delete 2>/dev/null; rm -rf "$snap" 2>/dev/null) || \
          log "WARN: Could not remove $snap_name"
    done
fi

if [ -n "$prev_snapshot" ] && [ -d "$prev_snapshot" ]; then
    # Parameter expansion instead of an unquoted $(basename ...): no word
    # splitting or globbing if the directory name contains unusual characters.
    log "Previous snapshot kept: ${prev_snapshot##*/}"
else
    log "No previous snapshot — full copy"
fi

# Also clean _old directory (stale deferred deletions)
# Acts only when _old contains at least one date-named snapshot directory.
old_dir="$backup_base/_old"
if [[ -d "$old_dir" ]]; then
    old_count=$(find "$old_dir" -maxdepth 1 -type d -name '????-??-??_*' 2>/dev/null | wc -l)
    if (( old_count > 0 )); then
        log "Cleaning _old directory ($old_count snapshot(s))..."
        find "$old_dir" -depth ! -name . -delete 2>/dev/null \
            || rm -rf "$old_dir/"* 2>/dev/null \
            || true
    fi
fi

# Empty NTFS Trash (backups go here instead of being truly deleted)
# Best-effort for both halves of the trash directory; failures are ignored.
log "Emptying NTFS Trash..."
trash_root="$backup_base/../.Trash-1000"
for trash_part in files info; do
    find "$trash_root/$trash_part" -depth ! -name . -delete 2>/dev/null \
        || rm -rf "$trash_root/$trash_part/"* 2>/dev/null \
        || true
done

# 4. Check free disk space AFTER cleanup
#    df reports available space in 1K blocks; convert to whole gigabytes and
#    abort when that is below min_free_gb.
backup_mount="/media/rogeryu/TYH-BACKUP"
available_kb=$(df --output=avail "$backup_mount" | tail -1)
available_gb=$(( available_kb / 1024 / 1024 ))

log "Disk space check AFTER cleanup: ${available_gb}GB free (minimum: ${min_free_gb}GB)"

if (( available_gb < min_free_gb )); then
    log "FATAL: Only ${available_gb}GB free after cleanup — below ${min_free_gb}GB threshold. Aborting."
    notify_telegram "🔴 *Backup Aborted* — Only ${available_gb}GB free after cleanup (threshold: ${min_free_gb}GB)"
    exit 1
fi

# 5. Pre-flight: check immich containers are healthy before backing up immich-uploads
#    Advisory only: a container that is not "running" (or cannot be inspected,
#    reported as "missing") produces a warning, never an abort.
log "Pre-flight health check for immich containers..."
immich_healthy=true
immich_containers=(immich_postgres immich_redis immich_server immich_machine_learning)
for container in "${immich_containers[@]}"; do
    status=$(podman inspect "$container" --format '{{.State.Status}}' 2>/dev/null || echo "missing")
    [[ "$status" == "running" ]] && continue
    log "WARN: Container $container is $status (not running)"
    immich_healthy=false
done
if [[ "$immich_healthy" == false ]]; then
    log "WARN: Some immich containers are unhealthy. Will attempt backup anyway but may fail."
fi

# ---- Run backups with auto-retry ----
# Create this run's snapshot directory. The failure case is handled explicitly
# (rather than left to `set -e`) so it is logged and reported like the other
# pre-flight failures instead of ending the script silently.
if ! mkdir -p "$current_backup"; then
    log "FATAL: Could not create snapshot directory $current_backup. Aborting."
    notify_telegram "🔴 *Backup Failed* — Could not create snapshot directory \`$current_backup\`"
    exit 1
fi

# From here on, an early exit is reported as an interrupted backup by the
# EXIT trap (it keys off started_backup / completed_normally).
started_backup=true
overall_success=true
overall_errors=""

# Run one rsync transfer under a time limit.
# Globals:   rsync_log_file (read)
# Arguments: $1 - source path
#            $2 - destination path
#            $3 - source name (accepted, not used by the function body)
#            $4 - time limit in seconds, handed to timeout(1)
#            $5 - attempt number (accepted, not used by the function body)
# Outputs:   rsync's stdout and stderr, merged onto stdout
# Returns:   the exit status of the timeout/rsync command (0 on success)
run_rsync() {
    local src="$1" dest="$2" src_name="$3" timeout_seconds="$4" attempt="$5"
    local -a rsync_opts=(
        -a --delete
        --no-inc-recursive
        --modify-window=1
        --info=progress2,stats2
        --log-file="$rsync_log_file"
        --exclude='*.tmp' --exclude='*.log' --exclude='*.cache'
    )

    # Last command in the function, so its status is the function's status.
    timeout "$timeout_seconds" rsync "${rsync_opts[@]}" "$src" "$dest" 2>&1
}

# Back up one source directory into $current_backup/<basename>, retrying once
# after an "auto-heal" pass if the first rsync fails.
# Globals:   current_backup (read); overall_success, overall_errors (written)
# Arguments: $1 - source directory (an entry of files_to_backup)
# Returns:   always 0. Failure is reported through overall_success so that one
#            bad source neither stops the remaining ones nor trips `set -e`.
backup_one_source() {
    local src="$1"
    local src_name dest_path timeout_seconds rc
    local retry_avail_kb retry_avail_gb container status line

    src_name=$(basename "$src")
    dest_path="$current_backup/$src_name"
    log "Starting backup of $src_name..."

    # Per-source time limit in seconds. Typical durations noted by the author:
    # media ~7 min, immich-uploads ~40 min, OL-Drive ~2 min.
    case "$src_name" in
        media | OL-Drive) timeout_seconds=3600 ;;
        *) timeout_seconds=10800 ;; # immich-uploads and anything unlisted: 3 hours
    esac

    # === Attempt 1 ===
    # The status is captured with `|| rc=$?`. Reading $? after an
    # `if cmd; then ...; fi` that has no else branch always yields 0, which
    # hid the real exit code and made timeouts (124) undetectable.
    rc=0
    run_rsync "$src" "$dest_path" "$src_name" "$timeout_seconds" 1 || rc=$?
    if [ "$rc" -eq 0 ]; then
        log "SUCCESS: Backed up $src_name"
        return 0
    fi

    if [ "$rc" -eq 124 ]; then
        log "ERROR: rsync for $src_name timed out after ${timeout_seconds}s — will retry"
        overall_errors="${overall_errors}${src_name} timed out; "
    else
        log "ERROR: rsync for $src_name failed with exit code $rc — will retry"
        overall_errors="${overall_errors}${src_name} exit $rc; "
    fi

    # === Auto-heal before retry ===
    # 1. Clean up partial destination
    if [ -d "$dest_path" ]; then
        log "AUTO-HEAL: Removing partial $src_name from failed transfer..."
        find "$dest_path" -depth ! -name . -delete 2>/dev/null || \
            (find "$dest_path" -type f -delete 2>/dev/null; find "$dest_path" -depth -type d -empty -delete 2>/dev/null; rm -rf "$dest_path" 2>/dev/null) || \
            log "WARN: Could not clean partial $dest_path"
        # Also remove the directory itself if it is still there and empty
        rmdir "$dest_path" 2>/dev/null || true
    fi

    # 2. For immich-uploads, attempt container restart if needed
    if [ "$src_name" = "immich-uploads" ]; then
        log "AUTO-HEAL: Checking immich containers before retry..."
        for container in immich_postgres immich_redis immich_server immich_machine_learning; do
            status=$(podman inspect "$container" --format '{{.State.Status}}' 2>/dev/null || echo "missing")
            if [ "$status" != "running" ]; then
                log "AUTO-HEAL: Container $container is $status — attempting restart..."
                log "AUTO-HEAL: Stopping and removing $container..."
                podman rm -f "$container" 2>/dev/null || true
                log "AUTO-HEAL: Recreating $container via podman-compose..."
                # Subshell: the cd must not change the script's own working
                # directory. Wrapped in `if` so a failing compose run is
                # logged instead of aborting the whole backup via `set -e`.
                if ! (cd /home/rogeryu/immich-app && podman-compose up -d 2>&1) \
                    | while IFS= read -r line; do
                        log "COMPOSE: $line"
                    done; then
                    log "WARN: podman-compose up failed — continuing with retry anyway"
                fi
                log "AUTO-HEAL: Waiting 15s for containers to stabilize..."
                sleep 15
                break # restart compose once
            fi
        done
    fi

    # 3. Re-check disk space on backup drive
    retry_avail_kb=$(df --output=avail /media/rogeryu/TYH-BACKUP | tail -1)
    retry_avail_gb=$((retry_avail_kb / 1024 / 1024))
    if [ "$retry_avail_gb" -lt 5 ]; then
        log "FATAL: Only ${retry_avail_gb}GB free on backup drive — cannot retry"
        overall_success=false
        return 0
    fi

    # === Attempt 2 (retry) ===
    log "RETRY: Starting retry of $src_name (attempt 2)..."
    rc=0
    run_rsync "$src" "$dest_path" "$src_name" "$timeout_seconds" 2 || rc=$?
    if [ "$rc" -eq 0 ]; then
        log "SUCCESS: $src_name backed up on retry"
        return 0
    fi

    if [ "$rc" -eq 124 ]; then
        log "ERROR: Retry of $src_name also timed out — giving up"
    else
        log "ERROR: Retry of $src_name also failed (exit $rc) — giving up"
    fi
    overall_success=false
    return 0
}

for src in "${files_to_backup[@]}"; do
    backup_one_source "$src"
done

# ---- Final report ----
# Set the completion flag first so the EXIT trap does not report this run as
# interrupted, then log and notify according to overall_success.
completed_normally=true
case "$overall_success" in
    true)
        log "Backup completed successfully at $(date)"
        notify_telegram "✅ *Backup completed successfully* — All directories backed up to TYH-BACKUP"
        ;;
    *)
        log "Backup completed WITH ERRORS at $(date)"
        notify_telegram "⚠️ *Backup completed with errors* — check \`$log_file\` for details"
        ;;
esac