# kb_ascii_scrub_v1.0.ps1
# Purpose: diagnose, then remove, every non-ASCII character from the text fields of the canonical KB
#          (folding: accented e -> e, oe ligature -> oe, quotes/dashes unified, etc.). PREVIEW = diagnostic, nothing is written.
#          EXECUTE = sanitization + SAFE-WRITE (UTF-8 with BOM). Targets PowerShell 5.1, so no ternary operator.

# Script parameters.
[CmdletBinding()]
param(
  # Diagnostic run: print non-ASCII counts and exit without writing.
  # This is also what happens when -Execute is not supplied.
  [switch]$Preview,
  # Write the sanitized KB back to disk (only takes effect when -Preview is not set).
  [switch]$Execute,
  # Registry root; the KB file is looked up at <Root>\bug_kb\BUG_KB.json.txt.
  [string]$Root = "\\DS-918\chatgpt\ChatGPT-Gouvernance-Projets\_registry",
  # Staging directory in which the new content is first written before being copied next to the target.
  [string]$StageRoot = "C:\Temp_Gouvernance"
)

# Create the directory $Path when it does not exist yet; emits nothing to the pipeline.
function Ensure-Dir([string]$Path){
  if(Test-Path -LiteralPath $Path){ return }
  $null = New-Item -ItemType Directory -Force -Path $Path
}
# Make sure the parent directory of $Target exists (delegates to Ensure-Dir).
# Does nothing when $Target has no parent component.
function Ensure-Parent([string]$Target){
  $parentDir = Split-Path -Parent $Target
  if(-not $parentDir){ return }
  Ensure-Dir $parentDir
}
# Return the current date/time as an ISO-8601 style string (yyyy-MM-ddTHH:mm:ss followed by
# the zone designator produced by the "K" specifier).
# The invariant culture is passed explicitly: in a custom format string ":" stands for the
# culture's time separator and the year follows the culture's calendar, so the result would
# otherwise vary with the regional settings of the machine.
function Get-NowIso(){
  (Get-Date).ToString("yyyy-MM-ddTHH:mm:ssK", [Globalization.CultureInfo]::InvariantCulture)
}
# Read $Path line by line and return the text that precedes the first line opening the
# "--- DOC-VERSION-FOOTER" block. The kept lines are re-joined with LF.
function Read-JsonSansFooter([string]$Path){
  $reader = New-Object IO.StreamReader($Path)
  try{
    $kept = New-Object 'System.Collections.Generic.List[string]'
    # ReadLine() returns $null once the end of the stream is reached.
    while($null -ne ($current = $reader.ReadLine())){
      if($current -match '^\s*---\s*DOC-VERSION-FOOTER'){ break }
      $kept.Add($current)
    }
    $kept.ToArray() -join "`n"
  } finally {
    # Always release the file handle, even if reading failed.
    $reader.Dispose()
  }
}
# Write $Content to $Target as UTF-8 with BOM using a staged write:
#   1. write the content to a random temp file under $StageRoot,
#   2. if $Target already exists, copy it to "<Target>.<yyyyMMdd_HHmmss>.bak",
#   3. copy the temp file to "<Target>.tmp" and move that over $Target.
# Returns the backup path, or $null when $Target did not exist beforehand.
# Any failure in these steps is raised as a terminating error instead of being reported on
# the error stream while the function carries on (which made callers announce success after
# a failed write). Staging files are removed on every exit path.
function Write-SafeText([string]$Target,[string]$Content,[string]$StageRoot){
  Ensure-Dir $StageRoot; Ensure-Parent $Target
  $tmp = Join-Path $StageRoot ("write_" + [IO.Path]::GetRandomFileName())
  $tmpR = "$Target.tmp"
  $utf8 = New-Object Text.UTF8Encoding($true)  # UTF-8 BOM
  $bak = $null
  try{
    [IO.File]::WriteAllText($tmp,$Content,$utf8)
    if(Test-Path -LiteralPath $Target){
      $bak = $Target+"."+(Get-Date -f yyyyMMdd_HHmmss)+".bak"
      Copy-Item -LiteralPath $Target -Destination $bak -Force -ErrorAction Stop
    }
    Copy-Item -LiteralPath $tmp -Destination $tmpR -Force -ErrorAction Stop
    Move-Item -LiteralPath $tmpR -Destination $Target -Force -ErrorAction Stop
  } catch {
    # Re-throw so the failure always terminates the caller's statement flow.
    throw
  } finally {
    # Best-effort cleanup: $tmpR only still exists when the final move did not happen.
    Remove-Item -LiteralPath $tmp -Force -ErrorAction SilentlyContinue
    Remove-Item -LiteralPath $tmpR -Force -ErrorAction SilentlyContinue
  }
  return $bak
}
# Count the characters of $s that are neither TAB, LF, CR nor printable ASCII (0x20-0x7E).
# A null input counts as 0.
function Count-NonAscii([string]$s){
  $total = 0
  if($null -ne $s){
    $hits = [regex]::Matches($s,"[^\x09\x0A\x0D\x20-\x7E]")
    $total = $hits.Count
  }
  return $total
}
# Tally the characters of $s that are neither TAB, LF, CR nor printable ASCII and emit the
# $max most frequent ones as objects with:
#   Char  - the offending character
#   Code  - its UTF-16 code unit formatted as "U+XXXX"
#   Count - number of occurrences in $s
# Nothing is emitted when $s contains no such character.
function List-NonAscii([string]$s, [int]$max=10){
  $tally = New-Object 'System.Collections.Generic.Dictionary[string,int]'
  foreach($hit in [regex]::Matches($s,"[^\x09\x0A\x0D\x20-\x7E]")){
    $ch = $hit.Value
    if($tally.ContainsKey($ch)){ $tally[$ch] = $tally[$ch] + 1 }
    else { $tally[$ch] = 1 }
  }
  # @() guarantees an (possibly empty) array so that an empty tally pipes nothing.
  $rows = @(
    foreach($pair in $tally.GetEnumerator()){
      $codeUnit = [int][char]($pair.Key)
      [pscustomobject]@{ Char=$pair.Key; Code=("U+" + ('{0:X4}' -f $codeUnit)); Count=$pair.Value }
    }
  )
  $rows | Sort-Object Count -Descending | Select-Object -First $max
}
# Fold $s to plain ASCII and return the result (null input yields "").
# Steps, in order:
#   1. map common typographic punctuation and ligatures to ASCII equivalents,
#   2. decompose (Unicode FormD) and drop combining marks, so accented letters keep their base letter,
#   3. replace every remaining character outside TAB/LF/CR/0x20-0x7E with a space,
#   4. turn "@{" into "(" to neutralize stringified hashtables,
#   5. collapse every run of two or more whitespace characters into one space, then trim.
function AsciiFold([string]$s){
  if($null -eq $s){ return "" }
  # Characters are built from their code points: Windows PowerShell 5.1 has no Unicode
  # escape inside strings, so a backtick-u sequence is just the literal text "uXXXX"
  # and would neither match the intended character nor leave such literal text alone.
  $table = @(
    @(0x2018,"'"), @(0x2019,"'"), @(0x201A,"'"), @(0x201B,"'"),   # single quotes
    @(0x201C,'"'), @(0x201D,'"'), @(0x201E,'"'),                  # double quotes
    @(0x2013,"-"), @(0x2014,"-"),                                 # en dash, em dash
    @(0x00A0," "),                                                # no-break space
    # Ligatures have no canonical decomposition, so FormD would leave them untouched
    # and step 3 would blank them out.
    @(0x0153,"oe"), @(0x0152,"OE"), @(0x00E6,"ae"), @(0x00C6,"AE")
  )
  foreach($pair in $table){
    # String.Replace is an ordinal, case-sensitive literal replacement.
    $s = $s.Replace([string][char]($pair[0]), $pair[1])
  }
  # Unicode decomposition, then removal of the combining marks (accents).
  $norm = $s.Normalize([Text.NormalizationForm]::FormD)
  $sb = New-Object Text.StringBuilder
  foreach($ch in $norm.ToCharArray()){
    $cat = [Globalization.CharUnicodeInfo]::GetUnicodeCategory($ch)
    if($cat -ne [Globalization.UnicodeCategory]::NonSpacingMark){ [void]$sb.Append($ch) }
  }
  $s2 = $sb.ToString()
  # Replace any residual non-ASCII character with a space.
  $s2 = [regex]::Replace($s2,"[^\x09\x0A\x0D\x20-\x7E]"," ")
  # Neutralize stringified hashtables.
  $s2 = $s2 -replace "@\{","("
  # Compact whitespace runs.
  $s2 = [regex]::Replace($s2,"\s{2,}"," ")
  $s2.Trim()
}
# Build the sanitized form of one KB entry $e as an ordered dictionary with the keys
# id, title, blocking, workaround, note, fix, tags, seen_in_threads, last_seen (in that order).
# Text fields go through AsciiFold; blocking is cast to [bool]; last_seen is only cast to
# [string]; tags and seen_in_threads always come out as arrays.
function Sanitize-Entry($e){
  # Fold each tag individually.
  $cleanTags = @()
  if($e.tags){
    foreach($rawTag in $e.tags){
      $foldedTag = AsciiFold ([string]$rawTag)
      $cleanTags += $foldedTag
    }
  }

  # seen_in_threads may be a single string or a collection.
  $threads = @()
  $rawThreads = $e.seen_in_threads
  if($rawThreads){
    if($rawThreads -is [string]){
      $threads = @(AsciiFold ([string]$rawThreads))
    } else {
      $threads = @($rawThreads | ForEach-Object { AsciiFold ([string]$_) })
    }
  }

  $entry = [ordered]@{}
  $entry['id'] = AsciiFold ([string]$e.id)
  $entry['title'] = AsciiFold ([string]$e.title)
  $entry['blocking'] = [bool]$e.blocking
  $entry['workaround'] = AsciiFold ([string]$e.workaround)
  $entry['note'] = AsciiFold ([string]$e.note)
  $entry['fix'] = AsciiFold ([string]$e.fix)
  $entry['tags'] = @($cleanTags)
  $entry['seen_in_threads'] = @($threads)
  $entry['last_seen'] = [string]$e.last_seen
  $entry
}

# Target file: <Root>\bug_kb\BUG_KB.json.txt. Exit code 2 when it is missing.
$KbPath = Join-Path (Join-Path $Root "bug_kb") "BUG_KB.json.txt"
if(-not (Test-Path -LiteralPath $KbPath)){ Write-Host "[ERR] KB absente: $KbPath"; exit 2 }

# Read the part above the footer, collect diagnostics on the raw text, then parse it.
$head = Read-JsonSansFooter $KbPath
$beforeNA = Count-NonAscii $head
# @() keeps the result an array even when a single distinct character is reported, so the
# emptiness test below does not depend on the row object's own Count property.
$topNA = @(List-NonAscii $head 10)

# Exit code 3 when the head is not valid JSON.
$kb = $null
try{ $kb = $head | ConvertFrom-Json -ErrorAction Stop } catch { Write-Host "[ERR] KB JSON invalide: $($_.Exception.Message)"; exit 3 }
# A missing, null or empty "entries" member is treated as an empty list. (Adding the member
# with Add-Member raised an error whenever the property already existed but was empty.)
$entries = @()
if($kb -and $kb.entries){ $entries = @($kb.entries) }

# Build the scrubbed version in memory (the KB is small).
$clean = New-Object System.Collections.ArrayList
foreach($e in $entries){ $null = $clean.Add( (Sanitize-Entry $e) ) }
$out = [ordered]@{ entries=@($clean); updated=Get-NowIso }
$json = ($out | ConvertTo-Json -Depth 6 -Compress)
$afterNA = Count-NonAscii $json

# PREVIEW: also the default when -Execute is not given; -Preview wins if both are set.
if($Preview -or (-not $Execute)){
  Write-Host "== PREVIEW :: KB ASCII SCRUB v1.0 =="
  Write-Host ("NonASCII before={0}  after_if_written={1}" -f $beforeNA,$afterNA)
  if($topNA.Length -gt 0){
    Write-Host "Top offending chars (first 10):"
    foreach($i in $topNA){ Write-Host (" - {0} (code {1})  count={2}" -f $i.Char,$i.Code,$i.Count) }
  } else {
    Write-Host "No offending chars detected."
  }
  Write-Host "No write performed (Preview)."
  exit 0
}

# EXECUTE: write the compact JSON followed by a freshly generated footer.
$footer = "`r`n`r`n--- DOC-VERSION-FOOTER ---`r`nGenerated: $($out.updated)`r`nPolicy: TXT-ONLY v1.0; SAFE-WRITE v1.1; GOV_SCRIPT_GATE v1.3`r`nSource: KB_ASCII_SCRUB_v1.0`r`n"
$bak = $null
try{
  # Promote non-terminating errors (failed copy/move) to terminating ones so that a failed
  # write is never followed by the "[OK]" message. Exit code 4 on failure.
  $ErrorActionPreference = 'Stop'
  $bak = Write-SafeText -Target $KbPath -Content ($json + $footer) -StageRoot $StageRoot
} catch {
  Write-Host "[ERR] Ecriture KB echouee: $($_.Exception.Message)"
  exit 4
}
$bakMsg = $bak; if(-not $bakMsg){ $bakMsg = "<none>" }
Write-Host ("[OK] KB scrub ASCII ecrite: {0}" -f $KbPath)
Write-Host ("Backup: {0}" -f $bakMsg)