#!/bin/sh

# OCRmyPDF continuous watch daemon (runs in the foreground so procd can
# supervise it).
# NOTE: `pipefail` and `local` are not POSIX; the /bin/sh of the target system
# must support them.
set -euo pipefail

# Directory that is scanned and watched for incoming PDFs.
INDIR="/share/CACHEDEV1_DATA/ecoDMS/ScanInput/HR"
# Directory that receives the processed PDFs.
OUTDIR="/share/CACHEDEV1_DATA/ecoDMS/ScanInput"
# Docker image that is run to perform the OCR.
DOCKER_IMAGE="ocrmypdf"
# Language list handed to the OCR container via -l.
LANGUAGES="ukr+hrv+deu+eng+rus"
# File that every progress and error message is appended to.
LOG_FILE="/opt/var/log/scanslavic.log"

# Create the log directory as well: every later step appends to $LOG_FILE, and
# under `set -e` a missing directory would stop the daemon on the first write.
mkdir -p "$OUTDIR" "$(dirname "$LOG_FILE")"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Daemon started (PID: $$)" >> "$LOG_FILE"


# Decide whether a PDF already carries a meaningful text layer.
# Arguments: $1 - path of the PDF file to inspect
# Returns:   0 if more than 20 non-whitespace characters can be extracted,
#            1 otherwise (including when pdftotext fails)
has_text_layer() {
    local pdf_file="$1"
    local raw_text text_content

    # "-" as the output name makes pdftotext write to stdout, so no temporary
    # file (and no EXIT trap to clean it up) is needed.
    raw_text=$(pdftotext "$pdf_file" - 2>/dev/null) || return 1

    # Drop whitespace, form feeds and other control bytes. LC_ALL=C makes tr
    # work on raw bytes, so only ASCII whitespace/control bytes are removed and
    # multi-byte UTF-8 text (Cyrillic etc.) is kept and counted.
    text_content=$(printf '%s' "$raw_text" | LC_ALL=C tr -d '[:space:][:cntrl:]')

    # More than 20 remaining characters counts as a meaningful text layer.
    [ "${#text_content}" -gt 20 ]
}


# Process one candidate file. A PDF that already has a text layer is moved to
# $OUTDIR unchanged; any other PDF is run through the OCRmyPDF container and
# the result is stored in $OUTDIR under the same file name, after which the
# source file is removed.
# Globals:   OUTDIR, DOCKER_IMAGE, LANGUAGES, LOG_FILE (read)
# Arguments: $1 - path of the candidate file
# Outputs:   progress and error lines appended to $LOG_FILE
# Returns:   0; failures are reported through the log so that callers running
#            under `set -e` keep going
process_pdf_file() {
    local pdf_file="$1"
    local pdf_name pdf_dir temp_out

    # Case-insensitive ".pdf" check. This is a function body, not a loop body,
    # so skipping has to be done with `return`, not `continue`.
    case "$pdf_file" in
        *.[pP][dD][fF]) ;;
        *) return 0 ;;
    esac
    sleep 1
    [ -f "$pdf_file" ] || return 0

    pdf_name=$(basename "$pdf_file")
    pdf_dir=$(dirname "$pdf_file")
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing: $pdf_name" >> "$LOG_FILE"

    if has_text_layer "$pdf_file"; then
        echo "  ✓ Already has meaningful text layer, skipping OCR" >> "$LOG_FILE"
        if mv "$pdf_file" "$OUTDIR/$pdf_name" 2>>"$LOG_FILE"; then
            echo "  → Moved to $OUTDIR" >> "$LOG_FILE"
            [ -f "$OUTDIR/$pdf_name" ] || echo "  ✗ Move failed" >> "$LOG_FILE"
        else
            echo "  ✗ Move failed" >> "$LOG_FILE"
        fi
        return 0
    fi

    echo "  ℹ Running OCRmyPDF" >> "$LOG_FILE"
    if ! temp_out=$(mktemp); then
        echo "  ✗ Could not create temporary file" >> "$LOG_FILE"
        return 0
    fi

    # Mount the directory the file actually lives in (not the watch root), so
    # that files found in sub-directories are visible as /in/<name> too.
    if docker run --rm \
        -v "$pdf_dir:/in:ro" \
        -v "$(dirname "$temp_out"):/tmp" \
        "$DOCKER_IMAGE" \
        -l "$LANGUAGES" \
        "/in/$pdf_name" \
        "/tmp/$(basename "$temp_out")" \
        2>>"$LOG_FILE"; then

        if mv "$temp_out" "$OUTDIR/$pdf_name" 2>>"$LOG_FILE"; then
            # Copy mode and ownership from the source file. The OCR result is
            # already in place, so a failure here is only logged.
            /opt/bin/chmod --reference "$pdf_file" "$OUTDIR/$pdf_name" 2>>"$LOG_FILE" \
                || echo "  ✗ Could not copy file mode" >> "$LOG_FILE"
            /opt/bin/chown --reference "$pdf_file" "$OUTDIR/$pdf_name" 2>>"$LOG_FILE" \
                || echo "  ✗ Could not copy file ownership" >> "$LOG_FILE"
            rm -f "$pdf_file"
            echo "  ✓ OCR complete" >> "$LOG_FILE"
        else
            echo "  ✗ Move failed" >> "$LOG_FILE"
            rm -f "$temp_out"
        fi
    else
        echo "  ✗ OCRmyPDF failed" >> "$LOG_FILE"
        rm -f "$temp_out"
    fi
    return 0
}

echo "Processing stale contents of $INDIR"

# Handle PDFs that are already present in $INDIR at start-up. The glob is
# case-insensitive so that it picks up the same extensions process_pdf_file
# accepts.
for pdf_file in "$INDIR"/*.[pP][dD][fF]; do
  # An unmatched glob is handed over as the literal pattern; skip it.
  [ -f "$pdf_file" ] || continue
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Processing: $pdf_file"
  process_pdf_file "$pdf_file"
done

echo "Waiting for new files in $INDIR"

# Main inotifywait loop - runs in the foreground.
# Each output line has the form "<events> <path>": the first field goes to
# `event`, the rest of the line (which may contain spaces) to `pdf_file`.
# `read -r` keeps backslashes in file names intact.
inotifywait -m -r "$INDIR" \
    -e moved_to,close_write \
    --format '%e %w%f' |
while read -r event pdf_file; do
  [ -f "$pdf_file" ] || continue
  sleep 1    # Safeguard against race conditions
  [ -f "$pdf_file" ] || continue
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Calling processing loop for: $pdf_file" >> "$LOG_FILE"
  process_pdf_file "$pdf_file"
done

