256 lines
20 KiB
PowerShell
256 lines
20 KiB
PowerShell
|
function Get-FileEncoding {
    <#
    .SYNOPSIS
        Get a basic file encoding

    .DESCRIPTION
        Get the file encoding in the most basic format.

        The first four bytes of each file are compared against Unicode byte order marks and a
        partial list of well-known binary "magic number" signatures. When nothing matches, the
        bytes are scanned for values outside tab (9), LF (10), CR (13) and 32-126; a file that
        passes this scan is reported as ASCII (or, optionally, UTF-7).

    .PARAMETER Path
        The path to check. Can be a folder if you need all the files internally.
        Child folders are processed as well, with the same MaxFileLength and TestForUTF7 settings.

    .PARAMETER MaxFileLength
        This is the maximum file size to try testing the whole file for when it might be Unicode or ASCII.
        Files of this size or larger only have their first four bytes scanned.

    .PARAMETER TestForUTF7
        UTF7 is a very unusual format to look for. Use sparingly.

    .OUTPUTS
        One hashtable per file with the keys File, Encoding and Description.
        Encoding is $null when the file is not recognised as text.
    #>
    param(
        [Parameter(Mandatory=$true)]
        [ValidateNotNullOrEmpty()]
        $Path,

        [Parameter(Mandatory=$false)]
        [long]$MaxFileLength = 512000,

        [Parameter(Mandatory=$false)]
        [switch]$TestForUTF7
    )

    $logLead = (Get-LogLeadName)

    $Path = (Resolve-Path $Path)

    Write-Verbose "$logLead : Checking $Path for file encoding"

    $item = (Get-Item -Path $Path)

    if ($item.PSIsContainer) {
        # Stream each child's result straight to the output. Forward the caller's options so a
        # folder scan honours them, and avoid collecting $null entries for empty sub-folders.
        foreach ($child in (Get-ChildItem -Path $Path)) {
            Get-FileEncoding -Path $child.FullName -MaxFileLength $MaxFileLength -TestForUTF7:$TestForUTF7
        }
        return
    }

    # Windows PowerShell reads raw bytes with '-Encoding Byte'; PowerShell 6+ replaced that
    # with the '-AsByteStream' switch.
    if ($PSVersionTable.PSVersion.Major -ge 6) {
        $byteReadArgs = @{ AsByteStream = $true }
    } else {
        $byteReadArgs = @{ Encoding = 'Byte' }
    }

    $bytes = [byte[]](Get-Content $Path -ReadCount 4 -TotalCount 4 @byteReadArgs)

    $encoding = $null
    $description = $null

    if (!(Test-IsCollectionNullOrEmpty $bytes)) {
        # Render the leading bytes (up to four) as a lowercase hex string. Files shorter than
        # four bytes simply produce a shorter string instead of indexing past the array end.
        $signature = -join ($bytes | ForEach-Object { '{0:x2}' -f $_ })

        # The first matching pattern wins (each branch breaks out of the switch), so a longer
        # signature must be listed before any shorter signature that is a prefix of it.
        switch -regex ($signature) {
            '^efbbbf'   { $encoding = [System.Text.Encoding]::UTF8; $description = 'UTF-8 encoded Unicode byte order mark, commonly seen in text files.'; break }
            '^fffe0000' { $encoding = [System.Text.Encoding]::UTF32; $description = 'UTF-32 encoded Unicode byte order mark (little-endian)'; break }
            '^fffe'     { $encoding = [System.Text.Encoding]::Unicode; $description = 'UTF-16 encoded Unicode byte order mark (little-endian)'; break }
            '^feff'     { $encoding = [System.Text.Encoding]::BigEndianUnicode; $description = 'UTF-16 encoded Unicode byte order mark (big-endian)'; break }
            # [System.Text.Encoding]::UTF32 is little-endian, so build the big-endian variant explicitly.
            '^0000feff' { $encoding = (New-Object System.Text.UTF32Encoding -ArgumentList $true, $true); $description = 'UTF-32 encoded Unicode byte order mark (big-endian)'; break }

            # A lot of other non-text files that we may be curious what it is. This is a partial list.
            # A lot of other file magic strings exist out there. $encoding stays $null for all of these.
            '^0000000c' { $description = 'JPEG 2000 graphic file'; break }
            '^00000018' { $description = 'Mpeg 4 video file'; break }
            '^00000100' { $description = 'Computer icon encoded in ICO file format'; break }
            '^000001b3' { $description = 'MPEG-1 video and MPEG-2 video (MPEG-1 Part 2 and MPEG-2 Part 2)'; break }
            '^000001ba' { $description = 'MPEG Program Stream (MPEG-1 Part 1 (essentially identical) and MPEG-2 Part 1)'; break }
            '^00010000' { $description = 'Palm Desktop Data File (Access format)'; break }
            '^00014244' { $description = 'Palm Desktop To Do Archive'; break }
            '^00014454' { $description = 'Palm Desktop Calendar Archive'; break }
            '^0061736d' { $description = 'WebAssembly binary format'; break }
            '^04224d18' { $description = 'LZ4 Frame Format'; break }
            '^05070000' { $description = 'AppleWorks 5 document'; break }
            '^0607e100' { $description = 'AppleWorks 6 document'; break }
            '^0a0d0d0a' { $description = 'PCAP Next Generation Dump File Format'; break }
            '^1a45dfa3' { $description = 'Matroska media container, including WebM'; break }
            '^1b4c7561' { $description = 'Lua bytecode'; break }
            '^1f8b'     { $description = 'GZIP compressed file'; break }
            '^1f9d'     { $description = 'tar zip'; break }
            '^1fa0'     { $description = 'tar zip'; break }
            '^20020162' { $description = 'Tableau Datasource'; break }
            '^213c6172' { $description = 'linux deb file'; break }
            '^2142444e' { $description = 'Outlook Post Office file'; break }
            '^2321'     { $description = 'Script or data to be passed to the program following the shebang (#!)'; break }
            '^24534449' { $description = 'System Deployment Image, a disk image format used by Microsoft'; break }
            '^2521'     { $description = 'PostScript File'; break }
            '^25504446' { $description = 'PDF Document'; break }
            '^27051956' { $description = 'U-Boot / uImage. Das U-Boot Universal Boot Loader.'; break }
            '^28b52ffd' { $description = 'Zstandard compressed file'; break }
            '^3026b275' { $description = 'Windows Video file or Windows Audio file'; break }
            '^3082'     { $description = 'DER encoded X.509 certificate'; break }
            '^310a3030' { $description = 'SubRip File'; break }
            '^3412aa55' { $description = 'VPK file, used to store game data for some Source Engine games'; break }
            '^37480302' { $description = 'KDB file'; break }
            '^377abcaf' { $description = '7-Zip File Format'; break }
            '^38425053' { $description = 'Photoshop Graphics'; break }
            '^3a290a'   { $description = 'Smile file'; break }
            '^3d202020' { $description = 'Flexible Image Transport System (FITS)'; break }
            '^3f5f0300' { $description = 'Help file'; break }
            '^41474433' { $description = 'FreeHand 8 document'; break }
            '^41542654' { $description = 'DjVu document'; break }
            '^424d'     { $description = 'Bitmap graphic'; break }
            '^425047fb' { $description = 'Better Portable Graphics format'; break }
            '^425a68'   { $description = 'Compressed file using Bzip2 algorithm'; break }
            '^435753'   { $description = 'flash .swf'; break }
            '^43723234' { $description = 'Google Chrome extension or packaged app'; break }
            '^44434d01' { $description = 'Windows Update Binary Delta Compression'; break }
            '^454d5533' { $description = 'Emulator III synth samples'; break }
            '^454d5832' { $description = 'Emulator Emaxsynth samples'; break }
            '^45520200' { $description = 'Roxio Toast disc image file, also some .dmg-files begin with same bytes'; break }
            '^464c4946' { $description = 'Free Lossless Image Format'; break }
            '^464c56'   { $description = 'Flash Video'; break }
            '^465753'   { $description = 'Flash Shockwave'; break }
            '^47494638' { $description = 'GIF graphic file'; break }
            # NOTE(review): this one-byte signature also matches any plain-text file whose first
            # character is 'G' (0x47), because the signature table runs before the ASCII scan.
            '^47'       { $description = 'MPEG Transit Stream'; break }
            '^494433'   { $description = 'MP3 file with ID3 identity tag'; break }
            '^4949'     { $description = 'TIF graphic file'; break }
            '^494e4458' { $description = 'Index file to a file or tape containing a backup done with AmiBack on an Amiga.'; break }
            '^49545346' { $description = 'MS Windows HtmlHelp Data'; break }
            '^4a6f7921' { $description = 'Preferred Executable Format'; break }
            '^4b444d56' { $description = 'VMWare Disk file'; break }
            '^4b444d'   { $description = 'VMDK files'; break }
            '^4c01'     { $description = 'Object Code File'; break }
            '^4c5a4950' { $description = 'lzip compressed file'; break }
            '^4d4c5649' { $description = 'Magic Lantern Video file'; break }
            '^4d4d002a' { $description = 'Tagged Image File Format (TIFF) (big-endian format)'; break }
            '^4d534346' { $description = 'CAB Installer file'; break }
            '^4d546864' { $description = 'MIDI sound file'; break }
            '^4d5a'     { $description = 'Windows MZ/PE/NE or SYS (driver) file'; break }
            '^4d696372' { $description = 'Microsoft Build System File (pdb, etc)'; break }
            '^4e45531a' { $description = 'Nintendo Entertainment System ROM file'; break }
            '^4f4152'   { $description = 'OAR file archive format, where ?? is the format version.'; break }
            '^4f5243'   { $description = 'Apache ORC (Optimized Row Columnar) file format'; break }
            '^4f626a01' { $description = 'Apache Avro binary file format'; break }
            '^4f676753' { $description = 'Ogg, an open source media container format'; break }
            '^50415231' { $description = 'Apache Parquet columnar file format'; break }
            '^504b0304' { $description = 'ZIP file (or masquerading file, such as Nuget, Choco, EPUB, JAR, ODF, OOXML, Office document)'; break }
            '^504b0506' { $description = 'zip file (empty archive)'; break }
            '^504b0708' { $description = 'zip file (spanned archive)'; break }
            '^504d4f43' { $description = 'Windows Files And Settings Transfer Repository'; break }
            '^52494646' { $description = 'AVI video file or WAV audio file'; break }
            '^524e4301' { $description = 'Compressed file using Rob Northen Compression (version 1 and 2) algorithm'; break }
            '^524e4302' { $description = 'Compressed file using Rob Northen Compression (version 1 and 2) algorithm'; break }
            '^5253564b' { $description = 'QuickZip rs compressed archive'; break }
            '^52617221' { $description = 'RAR file'; break }
            '^52656365' { $description = 'Email Message var5'; break }
            '^53445058' { $description = 'SMPTE DPX image (big-endian format)'; break }
            '^53455136' { $description = 'RCFile columnar file format'; break }
            '^53494d50' { $description = 'Flexible Image Transport System (FITS)'; break }
            '^53503031' { $description = 'Amazon Kindle Update Package'; break }
            '^53514c69' { $description = 'SQLite Database'; break }
            '^535a4444' { $description = 'Microsoft compressed file. File can be decompressed using Extract.exe/Expand.exe distributed with earlier versions of Windows.'; break }
            '^5374616e' { $description = 'Microsoft Database'; break }
            '^54415045' { $description = 'Microsoft Tape Format'; break }
            '^54444546' { $description = 'Telegram Desktop Encrypted File'; break }
            '^54444624' { $description = 'Telegram Desktop File'; break }
            '^5555aaaa' { $description = 'PhotoCap Vector'; break }
            '^58464952' { $description = 'Adobe Shockwave'; break }
            '^58504453' { $description = 'SMPTE DPX image (little-endian format)'; break }
            '^5a4d'     { $description = 'DOS ZM executable file format and its descendants (rare)'; break }
            '^5b5a6f6e' { $description = 'Microsoft Zone Identifier for URL Security Zones'; break }
            '^626f6f6b' { $description = 'macOS file Alias (Symbolic link)'; break }
            '^62767832' { $description = 'LZFSE - Lempel-Ziv style data compression algorithm using Finite State Entropy coding. OSS by Apple.'; break }
            '^6465780a' { $description = 'Dalvik Executable'; break }
            '^65877856' { $description = 'PhotoCap Object Templates'; break }
            '^664c6143' { $description = 'Free Lossless Audio Codec'; break }
            '^6d6f6f76' { $description = 'MOV video file'; break }
            '^746f7833' { $description = 'Open source portable voxel file'; break }
            '^75737461' { $description = 'Tar file'; break }
            '^762f3101' { $description = 'OpenEXR image'; break }
            '^774f4632' { $description = 'WOFF File Format 2.0'; break }
            '^774f4646' { $description = 'WOFF File Format 1.0'; break }
            # Must precede '^7801', which is a prefix of this signature.
            '^7801730d' { $description = 'Apple Disk Image file'; break }
            '^7801'     { $description = 'zlib - No Compression (no preset dictionary)'; break }
            '^7820'     { $description = 'zlib - No Compression (with preset dictionary)'; break }
            '^785634'   { $description = 'PhotoCap Template'; break }
            '^785e'     { $description = 'zlib - Best speed (no preset dictionary)'; break }
            '^78617221' { $description = 'eXtensible ARchive format'; break }
            '^787d'     { $description = 'zlib - Best speed (with preset dictionary)'; break }
            '^789c'     { $description = 'zlib - Default Compression (no preset dictionary)'; break }
            '^78bb'     { $description = 'zlib - Default Compression (with preset dictionary)'; break }
            '^78da'     { $description = 'zlib - Best Compression (no preset dictionary)'; break }
            '^78f9'     { $description = 'zlib - Best Compression (with preset dictionary)'; break }
            '^7b5c7274' { $description = 'Rich Text Format'; break }
            '^7f454c46' { $description = 'Executable and Linkable Format'; break }
            '^802a5fd7' { $description = 'Kodak Cineon image'; break }
            '^85'       { $description = 'PGP file'; break }
            '^89504e47' { $description = 'PNG graphic file'; break }
            '^8b455202' { $description = 'Roxio Toast disc image file, also some .dmg-files begin with same bytes'; break }
            '^a1b2c3d4' { $description = 'Libpcap File Format (big-endian)'; break }
            '^bebafeca' { $description = 'Palm Desktop Calendar Archive'; break }
            '^c9'       { $description = 'CP/M 3 and higher with overlays'; break }
            '^cafebabe' { $description = 'Java class file, Mach-O Fat Binary'; break }
            '^cefaedfe' { $description = 'Mach-O binary (reverse byte ordering scheme, 32-bit)'; break }
            '^cf8401'   { $description = 'Lepton compressed JPEG image'; break }
            '^cffaedfe' { $description = 'Mach-O binary (reverse byte ordering scheme, 64-bit)'; break }
            '^d0cf11e0' { $description = 'Office Document'; break }
            '^d4c3b2a1' { $description = 'Libpcap File Format (little-endian)'; break }
            '^d7cdc69a' { $description = 'Windows Meta File'; break }
            '^edabeedb' { $description = 'RedHat Package Manager (RPM) package'; break }
            '^fd377a58' { $description = 'XZ compression utility using LZMA2 compression'; break }
            '^feedfeed' { $description = 'JKS JavakeyStore'; break }
            '^ffd8'     { $description = 'jpg'; break }
            '^fff2'     { $description = 'MPEG-1 Layer 3 file'; break }
            '^fff3'     { $description = 'MPEG-1 Layer 3 file'; break }
            '^fffb'     { $description = 'MPEG-1 Layer 3 file'; break }
        }

        if ($null -eq $description) {
            # No known signature: fall back to scanning byte values. Only files below
            # MaxFileLength are read in full; larger files are judged on the bytes already read.
            if ($item.Length -lt $MaxFileLength) {
                $bytes = [byte[]](Get-Content $Path -Raw @byteReadArgs)
            }

            $isAscii = $true
            $byteCounter = 0
            foreach ($byte in $bytes) {
                $byteCounter += 1

                # If it is a tab (9), LF (10), CR (13) or alphanumeric+symbols (32-126), it's valid
                if ($byte -eq 9 -or $byte -eq 10 -or $byte -eq 13) {
                    continue
                }

                if ($byte -gt 126 -or $byte -lt 32) {
                    # Byte is outside the typical range of bytes for ASCII text
                    $isAscii = $false
                    $description = "File appears to contain non-ASCII characters (current byte value: $byte at counter: $byteCounter)"
                    break
                }
            }

            if ($isAscii) {
                $utf7TestResult = $false

                # You can't know if something is UTF7 without reading it entirely in and trying to parse as UTF7.
                # UTF7 is literally "encode Unicode as ASCII" much as base64 encoding might
                if ($TestForUTF7) {
                    Write-Verbose "$logLead : Testing for UTF7"
                    $asciiTextRaw = (Get-Content -Path $Path -Encoding ASCII -Raw)
                    $utf7TextRaw = (Get-Content -Path $Path -Encoding UTF7 -Raw)

                    # If decoding as UTF7 yields different text than decoding as ASCII, the file
                    # contains UTF7 escape sequences.
                    if ($asciiTextRaw -ne $utf7TextRaw) {
                        $encoding = [System.Text.Encoding]::UTF7
                        $description = "File appears to be UTF7 encoded"
                        $utf7TestResult = $true
                    }
                }

                if (!$utf7TestResult) {
                    $encoding = [System.Text.Encoding]::ASCII
                    $description = "File appears to only contain ASCII characters"
                }
            }
        }
    } else {
        $description = "Zero-byte file, can not determine encoding"
    }

    return @{File = $Path.Path; Encoding = $encoding; Description = $description; }
}
|