# kb_ascii_scrub_v1.0.ps1
# Purpose : Diagnose, then remove, every non-ASCII character from the text
#           fields of the canonical KB (folding: accented letters -> base
#           letter, oe/ae ligatures -> "oe"/"ae", typographic quotes and
#           dashes unified, etc.).
#   -Preview : diagnostic only, nothing is written (also the default when
#              -Execute is not given).
#   -Execute : sanitize and write back through SAFE-WRITE (UTF-8 with BOM).
# Exit codes: 0 = success/preview, 2 = KB file missing, 3 = invalid JSON,
#             4 = write failed.
# Target: Windows PowerShell 5.1 (no ternary operator, no `u{...} escapes).
[CmdletBinding()]
param(
    [switch]$Preview,
    [switch]$Execute,
    [string]$Root = "\\DS-918\chatgpt\ChatGPT-Gouvernance-Projets\_registry",
    [string]$StageRoot = "C:\Temp_Gouvernance"
)

# Everything outside TAB, LF, CR and printable ASCII (0x20-0x7E) is "non-ASCII"
# for the purposes of this script.
$script:NonAsciiPattern = '[^\x09\x0A\x0D\x20-\x7E]'

# Create the directory $Path if it does not exist yet.
function Ensure-Dir([string]$Path){
    if(-not (Test-Path -LiteralPath $Path)){
        New-Item -ItemType Directory -Force -Path $Path | Out-Null
    }
}

# Create the parent directory of $Target if it has one and it is missing.
function Ensure-Parent([string]$Target){
    $p = Split-Path -Parent $Target
    if($p){ Ensure-Dir $p }
}

# Current local time as an ISO-8601 string with UTC offset.
function Get-NowIso(){
    (Get-Date).ToString("yyyy-MM-ddTHH:mm:ssK")
}

# Read $Path line by line and return its content (lines joined with LF) up to,
# but not including, the first line that starts the DOC-VERSION-FOOTER block.
function Read-JsonSansFooter([string]$Path){
    $sr = New-Object IO.StreamReader($Path)
    try{
        $lines = New-Object 'System.Collections.Generic.List[string]'
        while(-not $sr.EndOfStream){
            $line = $sr.ReadLine()
            if($line -match '^\s*---\s*DOC-VERSION-FOOTER'){ break }
            $lines.Add($line)
        }
        ($lines -join "`n")
    } finally {
        $sr.Dispose()
    }
}

# SAFE-WRITE: write $Content to $Target as UTF-8 with BOM.
# The content is first staged under $StageRoot, the existing target (if any)
# is copied to a timestamped .bak, and the staged file is then copied next to
# the target and moved over it.
# Returns the backup path, or $null when the target did not exist.
# Any failure (including a failed backup) is a terminating error raised BEFORE
# the target is overwritten; temporary files are removed in all cases.
function Write-SafeText([string]$Target,[string]$Content,[string]$StageRoot){
    Ensure-Dir $StageRoot
    Ensure-Parent $Target
    $tmp  = Join-Path $StageRoot ("write_" + [IO.Path]::GetRandomFileName())
    $tmpR = "$Target.tmp"
    $bak  = $null
    try{
        $utf8 = New-Object Text.UTF8Encoding($true)   # $true => emit BOM
        [IO.File]::WriteAllText($tmp,$Content,$utf8)
        if(Test-Path -LiteralPath $Target){
            $bak = $Target + "." + (Get-Date -f yyyyMMdd_HHmmss) + ".bak"
            # Stop on failure: never overwrite the target without a backup.
            Copy-Item -LiteralPath $Target -Destination $bak -Force -ErrorAction Stop
        }
        Copy-Item -LiteralPath $tmp  -Destination $tmpR   -Force -ErrorAction Stop
        Move-Item -LiteralPath $tmpR -Destination $Target -Force -ErrorAction Stop
    } finally {
        # Best-effort cleanup of whatever temporary file is still around.
        foreach($leftover in @($tmp,$tmpR)){
            if(Test-Path -LiteralPath $leftover){
                Remove-Item -LiteralPath $leftover -Force -ErrorAction SilentlyContinue
            }
        }
    }
    return $bak
}

# Number of non-ASCII UTF-16 code units in $s (0 for null/empty).
function Count-NonAscii([string]$s){
    if([string]::IsNullOrEmpty($s)){ return 0 }
    ([regex]::Matches($s,$script:NonAsciiPattern)).Count
}

# Return up to $max objects {Char, Code (U+XXXX), Count} describing the most
# frequent non-ASCII characters of $s, most frequent first.
function List-NonAscii([string]$s, [int]$max=10){
    $freq = New-Object 'System.Collections.Generic.Dictionary[string,int]'
    # Not named $matches: that is a PowerShell automatic variable.
    $found = [regex]::Matches($s,$script:NonAsciiPattern)
    foreach($m in $found){
        $c = $m.Value
        if(-not $freq.ContainsKey($c)){ $freq[$c] = 0 }
        $freq[$c] = $freq[$c] + 1
    }
    $items = @()
    foreach($k in $freq.Keys){
        $code = [int][char]$k
        $hex  = "U+" + ('{0:X4}' -f $code)
        $items += ,[pscustomobject]@{ Char=$k; Code=$hex; Count=$freq[$k] }
    }
    $items | Sort-Object Count -Descending | Select-Object -First $max
}

# Fold $s to plain ASCII and return the trimmed result ("" for null/empty):
#   1. typographic quotes/dashes/NBSP -> ASCII equivalents,
#   2. oe/ae ligatures -> two-letter ASCII form,
#   3. Unicode decomposition (FormD) + removal of combining marks (accents),
#   4. any remaining non-ASCII character -> space,
#   5. "@{" (stringified hashtable marker) -> "(",
#   6. runs of whitespace collapsed to a single space.
function AsciiFold([string]$s){
    if([string]::IsNullOrEmpty($s)){ return "" }

    # Code points are written as .NET regex \uXXXX escapes inside single
    # quotes: Windows PowerShell 5.1 has no backtick-u string escape, and this
    # keeps the script file itself pure ASCII.
    $s = $s -replace '[\u2018\u2019\u201A\u201B]', "'"
    $s = $s -replace '[\u201C\u201D\u201E]', '"'
    $s = $s -replace '[\u2013\u2014]', '-'
    $s = $s -replace '\u00A0', ' '

    # Ligatures have no canonical decomposition, so FormD leaves them intact;
    # map them explicitly (case-sensitive to preserve capitalisation).
    $s = $s -creplace '\u0153','oe' -creplace '\u0152','OE'
    $s = $s -creplace '\u00E6','ae' -creplace '\u00C6','AE'

    # Decompose, then drop the combining marks so only base letters remain.
    $norm = $s.Normalize([Text.NormalizationForm]::FormD)
    $sb = New-Object Text.StringBuilder
    foreach($ch in $norm.ToCharArray()){
        $cat = [Globalization.CharUnicodeInfo]::GetUnicodeCategory($ch)
        if($cat -ne [Globalization.UnicodeCategory]::NonSpacingMark){
            [void]$sb.Append($ch)
        }
    }
    $s2 = $sb.ToString()

    # Replace every residual non-ASCII character with a space.
    $s2 = [regex]::Replace($s2,$script:NonAsciiPattern," ")
    # Neutralize stringified hashtables.
    $s2 = $s2 -replace "@\{","("
    # Collapse whitespace runs.
    $s2 = [regex]::Replace($s2,"\s{2,}"," ")
    $s2.Trim()
}

# Build a sanitized, ordered copy of one KB entry: every text field goes
# through AsciiFold, 'blocking' is coerced to bool, 'tags' and
# 'seen_in_threads' are always emitted as arrays, 'last_seen' is kept as a
# plain string.
function Sanitize-Entry($e){
    $tags = @()
    if($e.tags){
        foreach($t in $e.tags){ $tags += (AsciiFold ([string]$t)) }
    }

    $seenIn = @()
    if($e.seen_in_threads){
        if($e.seen_in_threads -is [string]){
            $seenIn = @(AsciiFold ([string]$e.seen_in_threads))
        } else {
            $seenIn = @($e.seen_in_threads | ForEach-Object { AsciiFold ([string]$_) })
        }
    }

    [ordered]@{
        id              = AsciiFold ([string]$e.id)
        title           = AsciiFold ([string]$e.title)
        blocking        = [bool]$e.blocking
        workaround      = AsciiFold ([string]$e.workaround)
        note            = AsciiFold ([string]$e.note)
        fix             = AsciiFold ([string]$e.fix)
        tags            = @($tags)
        seen_in_threads = @($seenIn)
        last_seen       = [string]$e.last_seen
    }
}

# Target file
$KbPath = Join-Path (Join-Path $Root "bug_kb") "BUG_KB.json.txt"
if(-not (Test-Path -LiteralPath $KbPath)){
    Write-Host "[ERR] KB absente: $KbPath"
    exit 2
}

# Read the JSON head (footer stripped), gather diagnostics, then parse.
$head     = Read-JsonSansFooter $KbPath
$beforeNA = Count-NonAscii $head
# @(...) forces an array: in Windows PowerShell 5.1 a single [pscustomobject]
# has no .Count property, which would hide a lone offending character.
$topNA    = @(List-NonAscii $head 10)

$kb = $null
try{
    $kb = $head | ConvertFrom-Json -ErrorAction Stop
} catch {
    Write-Host "[ERR] KB JSON invalide: $($_.Exception.Message)"
    exit 3
}

# A missing, null or empty 'entries' is treated as an empty list (the parsed
# object itself is not modified).
$entries = @()
if($kb -and $kb.entries){ $entries = @($kb.entries) }

# Build the scrubbed version in memory (the KB is of modest size).
$clean = New-Object System.Collections.ArrayList
foreach($e in $entries){
    $null = $clean.Add( (Sanitize-Entry $e) )
}
$out     = [ordered]@{ entries=@($clean); updated=Get-NowIso }
$json    = ($out | ConvertTo-Json -Depth 6 -Compress)
$afterNA = Count-NonAscii $json

# PREVIEW (explicit, or implied by the absence of -Execute)
if($Preview -or (-not $Execute)){
    Write-Host "== PREVIEW :: KB ASCII SCRUB v1.0 =="
    Write-Host ("NonASCII before={0} after_if_written={1}" -f $beforeNA,$afterNA)
    if($topNA.Count -gt 0){
        Write-Host "Top offending chars (first 10):"
        foreach($i in $topNA){
            Write-Host (" - {0} (code {1}) count={2}" -f $i.Char,$i.Code,$i.Count)
        }
    } else {
        Write-Host "No offending chars detected."
    }
    Write-Host "No write performed (Preview)."
    exit 0
}

# EXECUTE
$footer = "`r`n`r`n--- DOC-VERSION-FOOTER ---`r`nGenerated: $($out.updated)`r`nPolicy: TXT-ONLY v1.0; SAFE-WRITE v1.1; GOV_SCRIPT_GATE v1.3`r`nSource: KB_ASCII_SCRUB_v1.0`r`n"
$bak = $null
try{
    $bak = Write-SafeText -Target $KbPath -Content ($json + $footer) -StageRoot $StageRoot
} catch {
    # Do not report success when the backup or the write itself failed.
    Write-Host "[ERR] Ecriture KB echouee: $($_.Exception.Message)"
    exit 4
}
$bakMsg = $bak
if(-not $bakMsg){ $bakMsg = "" }
Write-Host ("[OK] KB scrub ASCII ecrite: {0}" -f $KbPath)
Write-Host ("Backup: {0}" -f $bakMsg)