ps/Modules/Alkami.PowerShell.Common/Public/Get-FileEncoding.ps1
2023-05-30 22:51:22 -07:00

256 lines
20 KiB
PowerShell

function Get-FileEncoding {
<#
.SYNOPSIS
    Get a basic file encoding
.DESCRIPTION
    Get the file encoding in the most basic format.

    The first four bytes of each file are compared against a table of byte order marks and
    well-known "magic number" signatures. When nothing in the table matches, the file content
    is scanned for bytes outside the printable ASCII range (tab, LF and CR are allowed) and the
    file is reported as ASCII, non-ASCII, or (optionally) UTF7.

    When Path is a folder, every child item is processed recursively with the same options.
.PARAMETER Path
    The path to check. Can be a folder if you need all the files internally.
.PARAMETER MaxFileLength
    This is the maximum file size to try testing the whole file for when it might be Unicode or ASCII.
    Files whose length is below this value are read in full for the ASCII scan; larger files are
    judged on their first four bytes only.
.PARAMETER TestForUTF7
    UTF7 is a very unusual format to look for. Use sparingly.
    When set, a file that passed the ASCII scan is read again as ASCII and as UTF7, and is reported
    as UTF7 if the two decoded strings differ.
.OUTPUTS
    An array of hashtables, one per file examined, with the keys:
      File        - the resolved path of the file
      Encoding    - a System.Text.Encoding instance, or $null when no text encoding was identified
      Description - a human readable description of what was detected
.EXAMPLE
    Get-FileEncoding -Path .\scripts -TestForUTF7
#>
    param(
        [Parameter(Mandatory=$true)]
        [ValidateNotNullOrEmpty()]
        $Path,
        [Parameter(Mandatory=$false)]
        [long]$MaxFileLength = 512000,
        [Parameter(Mandatory=$false)]
        [switch]$TestForUTF7
    )

    $logLead = (Get-LogLeadName)
    $Path = (Resolve-Path $Path)
    Write-Verbose "$logLead : Checking $Path for file encoding"

    $returns = @()
    $item = (Get-Item -Path $Path)

    if ($item.PSIsContainer) {
        $children = (Get-ChildItem -Path $Path)
        foreach ($child in $children) {
            # Forward the caller's options so nested files are tested the same way as top-level ones.
            $returns += (Get-FileEncoding -Path $child.FullName -MaxFileLength $MaxFileLength -TestForUTF7:$TestForUTF7)
        }
    } else {
        $bytes = [byte[]](Get-Content $Path -Encoding Byte -ReadCount 4 -TotalCount 4)
        $encoding = $null
        $description = $null

        if (!(Test-IsCollectionNullOrEmpty $bytes)) {
            # Run the first four bytes (returned above) through this formatting string and check the file contents.
            # Files shorter than four bytes simply produce a shorter hex string.
            # Order matters: the first matching pattern wins (each action ends in break), so a longer,
            # more specific signature must appear before any shorter pattern that is a prefix of it.
            switch -regex ('{0:x2}{1:x2}{2:x2}{3:x2}' -f $bytes[0],$bytes[1],$bytes[2],$bytes[3]) {
                '^efbbbf' { $encoding = [System.Text.Encoding]::UTF8; $description = 'UTF-8 encoded Unicode byte order mark, commonly seen in text files.'; break; }
                '^fffe0000' { $encoding = [System.Text.Encoding]::UTF32; $description = 'UTF-32 encoded Unicode byte order mark little-endian 32-bit'; break; }
                '^fffe' { $encoding = [System.Text.Encoding]::Unicode; $description = 'UTF-16 encoded Unicode byte order mark little-endian 16-bit'; break; }
                '^feff' { $encoding = [System.Text.Encoding]::BigEndianUnicode; $description = 'UTF-16 encoded Unicode byte order mark (big-endian)'; break; }
                # [System.Text.Encoding]::UTF32 is little-endian; a 00 00 FE FF mark needs the big-endian variant.
                '^0000feff' { $encoding = (New-Object -TypeName System.Text.UTF32Encoding -ArgumentList $true, $true); $description = 'UTF-32 encoded Unicode byte order mark (big-endian)'; break; }
                # A lot of other non-text files that we may be curious what it is. This is a partial list. A lot of other file magic strings exist out there.
                '^0000000c' { $encoding = $null; $description = 'JPEG 2000 graphic file'; break; }
                '^00000018' { $encoding = $null; $description = 'Mpeg 4 video file'; break; }
                '^00000100' { $encoding = $null; $description = 'Computer icon encoded in ICO file format'; break; }
                '^000001b3' { $encoding = $null; $description = 'MPEG-1 video and MPEG-2 video (MPEG-1 Part 2 and MPEG-2 Part 2)'; break; }
                '^000001ba' { $encoding = $null; $description = 'MPEG Program Stream (MPEG-1 Part 1 (essentially identical) and MPEG-2 Part 1)'; break; }
                '^00010000' { $encoding = $null; $description = 'Palm Desktop Data File (Access format)'; break; }
                '^00014244' { $encoding = $null; $description = 'Palm Desktop To Do Archive'; break; }
                '^00014454' { $encoding = $null; $description = 'Palm Desktop Calendar Archive'; break; }
                '^0061736d' { $encoding = $null; $description = 'WebAssembly binary format'; break; }
                '^04224d18' { $encoding = $null; $description = 'LZ4 Frame Format'; break; }
                '^05070000' { $encoding = $null; $description = 'AppleWorks 5 document'; break; }
                '^0607e100' { $encoding = $null; $description = 'AppleWorks 6 document'; break; }
                '^0a0d0d0a' { $encoding = $null; $description = 'PCAP Next Generation Dump File Format'; break; }
                '^1a45dfa3' { $encoding = $null; $description = 'Matroska media container, including WebM'; break; }
                '^1b4c7561' { $encoding = $null; $description = 'Lua bytecode'; break; }
                '^1f8b' { $encoding = $null; $description = 'GZIP compressed file'; break; }
                '^1f9d' { $encoding = $null; $description = 'tar zip'; break; }
                '^1fa0' { $encoding = $null; $description = 'tar zip'; break; }
                '^20020162' { $encoding = $null; $description = 'Tableau Datasource'; break; }
                '^213c6172' { $encoding = $null; $description = 'linux deb file'; break; }
                '^2142444e' { $encoding = $null; $description = 'Outlook Post Office file'; break; }
                '^2321' { $encoding = $null; $description = 'Script or data to be passed to the program following the shebang (#!)'; break; }
                '^24534449' { $encoding = $null; $description = 'System Deployment Image, a disk image format used by Microsoft'; break; }
                '^2521' { $encoding = $null; $description = 'PostScript File'; break; }
                '^25504446' { $encoding = $null; $description = 'PDF Document'; break; }
                '^27051956' { $encoding = $null; $description = 'U-Boot / uImage. Das U-Boot Universal Boot Loader.'; break; }
                '^28b52ffd' { $encoding = $null; $description = 'Zstandard compressed file'; break; }
                '^3026b275' { $encoding = $null; $description = 'Windows Video file or Windows Audio file'; break; }
                '^3082' { $encoding = $null; $description = 'DER encoded X.509 certificate'; break; }
                '^310a3030' { $encoding = $null; $description = 'SubRip File'; break; }
                '^3412aa55' { $encoding = $null; $description = 'VPK file, used to store game data for some Source Engine games'; break; }
                '^37480302' { $encoding = $null; $description = 'KDB file'; break; }
                '^377abcaf' { $encoding = $null; $description = '7-Zip File Format'; break; }
                '^38425053' { $encoding = $null; $description = 'Photoshop Graphics'; break; }
                '^3a290a' { $encoding = $null; $description = 'Smile file'; break; }
                '^3d202020' { $encoding = $null; $description = 'Flexible Image Transport System (FITS)'; break; }
                '^3f5f0300' { $encoding = $null; $description = 'Help file'; break; }
                '^41474433' { $encoding = $null; $description = 'FreeHand 8 document'; break; }
                '^41542654' { $encoding = $null; $description = 'DjVu document'; break; }
                '^424d' { $encoding = $null; $description = 'Bitmap graphic'; break; }
                '^425047fb' { $encoding = $null; $description = 'Better Portable Graphics format'; break; }
                '^425a68' { $encoding = $null; $description = 'Compressed file using Bzip2 algorithm'; break; }
                '^435753' { $encoding = $null; $description = 'flash .swf'; break; }
                '^43723234' { $encoding = $null; $description = 'Google Chrome extension or packaged app'; break; }
                '^44434d01' { $encoding = $null; $description = 'Windows Update Binary Delta Compression'; break; }
                '^454d5533' { $encoding = $null; $description = 'Emulator III synth samples'; break; }
                '^454d5832' { $encoding = $null; $description = 'Emulator Emaxsynth samples'; break; }
                '^45520200' { $encoding = $null; $description = 'Roxio Toast disc image file, also some .dmg-files begin with same bytes'; break; }
                '^464c4946' { $encoding = $null; $description = 'Free Lossless Image Format'; break; }
                '^464c56' { $encoding = $null; $description = 'Flash Video'; break; }
                '^465753' { $encoding = $null; $description = 'Flash Shockwave'; break; }
                '^47494638' { $encoding = $null; $description = 'GIF graphic file'; break; }
                # NOTE(review): a single 0x47 is also the letter "G", so any text file starting with "G" lands here.
                '^47' { $encoding = $null; $description = 'MPEG Transport Stream'; break; }
                '^494433' { $encoding = $null; $description = 'MP3 file with ID3 identity tag'; break; }
                '^4949' { $encoding = $null; $description = 'TIF graphic file'; break; }
                '^494e4458' { $encoding = $null; $description = 'Index file to a file or tape containing a backup done with AmiBack on an Amiga.'; break; }
                '^49545346' { $encoding = $null; $description = 'MS Windows HtmlHelp Data'; break; }
                '^4a6f7921' { $encoding = $null; $description = 'Preferred Executable Format'; break; }
                '^4b444d56' { $encoding = $null; $description = 'VMWare Disk file'; break; }
                '^4b444d' { $encoding = $null; $description = 'VMDK files'; break; }
                '^4c01' { $encoding = $null; $description = 'Object Code File'; break; }
                '^4c5a4950' { $encoding = $null; $description = 'lzip compressed file'; break; }
                '^4d4c5649' { $encoding = $null; $description = 'Magic Lantern Video file'; break; }
                '^4d4d002a' { $encoding = $null; $description = 'Tagged Image File Format (TIFF) (big-endian format)'; break; }
                '^4d534346' { $encoding = $null; $description = 'CAB Installer file'; break; }
                '^4d546864' { $encoding = $null; $description = 'MIDI sound file'; break; }
                '^4d5a' { $encoding = $null; $description = 'Windows MZ/PE/NE or SYS (driver) file'; break; }
                '^4d696372' { $encoding = $null; $description = 'Microsoft Build System File (pdb, etc)'; break; }
                '^4e45531a' { $encoding = $null; $description = 'Nintendo Entertainment System ROM file'; break; }
                '^4f4152' { $encoding = $null; $description = 'OAR file archive format, where ?? is the format version.'; break; }
                '^4f5243' { $encoding = $null; $description = 'Apache ORC (Optimized Row Columnar) file format'; break; }
                '^4f626a01' { $encoding = $null; $description = 'Apache Avro binary file format'; break; }
                '^4f676753' { $encoding = $null; $description = 'Ogg, an open source media container format'; break; }
                '^50415231' { $encoding = $null; $description = 'Apache Parquet columnar file format'; break; }
                '^504b0304' { $encoding = $null; $description = 'ZIP file (or masquerading file, such as Nuget, Choco, EPUB, JAR, ODF, OOXML, Office document)'; break; }
                '^504b0506' { $encoding = $null; $description = 'zip file (empty archive)'; break; }
                '^504b0708' { $encoding = $null; $description = 'zip file (spanned archive)'; break; }
                '^504d4f43' { $encoding = $null; $description = 'Windows Files And Settings Transfer Repository'; break; }
                '^52494646' { $encoding = $null; $description = 'AVI video file or WAV audio file'; break; }
                '^524e4301' { $encoding = $null; $description = 'Compressed file using Rob Northen Compression (version 1 and 2) algorithm'; break; }
                '^524e4302' { $encoding = $null; $description = 'Compressed file using Rob Northen Compression (version 1 and 2) algorithm'; break; }
                '^5253564b' { $encoding = $null; $description = 'QuickZip rs compressed archive'; break; }
                '^52617221' { $encoding = $null; $description = 'RAR file'; break; }
                '^52656365' { $encoding = $null; $description = 'Email Message var5'; break; }
                '^53445058' { $encoding = $null; $description = 'SMPTE DPX image (big-endian format)'; break; }
                '^53455136' { $encoding = $null; $description = 'RCFile columnar file format'; break; }
                '^53494d50' { $encoding = $null; $description = 'Flexible Image Transport System (FITS)'; break; }
                '^53503031' { $encoding = $null; $description = 'Amazon Kindle Update Package'; break; }
                '^53514c69' { $encoding = $null; $description = 'SQLite Database'; break; }
                '^535a4444' { $encoding = $null; $description = 'Microsoft compressed file. File can be decompressed using Extract.exe/Expand.exe distributed with earlier versions of Windows.'; break; }
                '^5374616e' { $encoding = $null; $description = 'Microsoft Database'; break; }
                '^54415045' { $encoding = $null; $description = 'Microsoft Tape Format'; break; }
                '^54444546' { $encoding = $null; $description = 'Telegram Desktop Encrypted File'; break; }
                '^54444624' { $encoding = $null; $description = 'Telegram Desktop File'; break; }
                '^5555aaaa' { $encoding = $null; $description = 'PhotoCap Vector'; break; }
                '^58464952' { $encoding = $null; $description = 'Adobe Shockwave'; break; }
                '^58504453' { $encoding = $null; $description = 'SMPTE DPX image (little-endian format)'; break; }
                '^5a4d' { $encoding = $null; $description = 'DOS ZM executable file format and its descendants (rare)'; break; }
                '^5b5a6f6e' { $encoding = $null; $description = 'Microsoft Zone Identifier for URL Security Zones'; break; }
                '^626f6f6b' { $encoding = $null; $description = 'macOS file Alias (Symbolic link)'; break; }
                '^62767832' { $encoding = $null; $description = 'LZFSE - Lempel-Ziv style data compression algorithm using Finite State Entropy coding. OSS by Apple.'; break; }
                '^6465780a' { $encoding = $null; $description = 'Dalvik Executable'; break; }
                '^65877856' { $encoding = $null; $description = 'PhotoCap Object Templates'; break; }
                '^664c6143' { $encoding = $null; $description = 'Free Lossless Audio Codec'; break; }
                '^6d6f6f76' { $encoding = $null; $description = 'MOV video file'; break; }
                '^746f7833' { $encoding = $null; $description = 'Open source portable voxel file'; break; }
                '^75737461' { $encoding = $null; $description = 'Tar file'; break; }
                '^762f3101' { $encoding = $null; $description = 'OpenEXR image'; break; }
                '^774f4632' { $encoding = $null; $description = 'WOFF File Format 2.0'; break; }
                '^774f4646' { $encoding = $null; $description = 'WOFF File Format 1.0'; break; }
                # Must come before the shorter '^7801' zlib pattern, which would otherwise always match first.
                '^7801730d' { $encoding = $null; $description = 'Apple Disk Image file'; break; }
                '^7801' { $encoding = $null; $description = 'zlib - No Compression (no preset dictionary)'; break; }
                '^7820' { $encoding = $null; $description = 'zlib - No Compression (with preset dictionary)'; break; }
                '^785634' { $encoding = $null; $description = 'PhotoCap Template'; break; }
                '^785e' { $encoding = $null; $description = 'zlib - Best speed (no preset dictionary)'; break; }
                '^78617221' { $encoding = $null; $description = 'eXtensible ARchive format'; break; }
                '^787d' { $encoding = $null; $description = 'zlib - Best speed (with preset dictionary)'; break; }
                '^789c' { $encoding = $null; $description = 'zlib - Default Compression (no preset dictionary)'; break; }
                '^78bb' { $encoding = $null; $description = 'zlib - Default Compression (with preset dictionary)'; break; }
                '^78da' { $encoding = $null; $description = 'zlib - Best Compression (no preset dictionary)'; break; }
                '^78f9' { $encoding = $null; $description = 'zlib - Best Compression (with preset dictionary)'; break; }
                '^7b5c7274' { $encoding = $null; $description = 'Rich Text Format'; break; }
                '^7f454c46' { $encoding = $null; $description = 'Executable and Linkable Format'; break; }
                '^802a5fd7' { $encoding = $null; $description = 'Kodak Cineon image'; break; }
                '^85' { $encoding = $null; $description = 'PGP file'; break; }
                '^89504e47' { $encoding = $null; $description = 'PNG graphic file'; break; }
                '^8b455202' { $encoding = $null; $description = 'Roxio Toast disc image file, also some .dmg-files begin with same bytes'; break; }
                '^a1b2c3d4' { $encoding = $null; $description = 'Libpcap File Format (big-endian)'; break; }
                '^bebafeca' { $encoding = $null; $description = 'Palm Desktop Calendar Archive'; break; }
                '^c9' { $encoding = $null; $description = 'CP/M 3 and higher with overlays'; break; }
                '^cafebabe' { $encoding = $null; $description = 'Java class file, Mach-O Fat Binary'; break; }
                '^cefaedfe' { $encoding = $null; $description = 'Mach-O binary (reverse byte ordering scheme, 32-bit)'; break; }
                '^cf8401' { $encoding = $null; $description = 'Lepton compressed JPEG image'; break; }
                '^cffaedfe' { $encoding = $null; $description = 'Mach-O binary (reverse byte ordering scheme, 64-bit)'; break; }
                '^d0cf11e0' { $encoding = $null; $description = 'Office Document'; break; }
                '^d4c3b2a1' { $encoding = $null; $description = 'Libpcap File Format (little-endian)'; break; }
                '^d7cdc69a' { $encoding = $null; $description = 'Windows Meta File'; break; }
                '^edabeedb' { $encoding = $null; $description = 'RedHat Package Manager (RPM) package'; break; }
                '^fd377a58' { $encoding = $null; $description = 'XZ compression utility using LZMA2 compression'; break; }
                '^feedfeed' { $encoding = $null; $description = 'JKS JavakeyStore'; break; }
                '^ffd8' { $encoding = $null; $description = 'jpg'; break; }
                '^fff2' { $encoding = $null; $description = 'MPEG-1 Layer 3 file'; break; }
                '^fff3' { $encoding = $null; $description = 'MPEG-1 Layer 3 file'; break; }
                '^fffb' { $encoding = $null; $description = 'MPEG-1 Layer 3 file'; break; }
            }

            if ($null -eq $description) {
                # No signature matched. Scan the content for non-ASCII bytes.
                # Only files below MaxFileLength are read in full; for anything larger the scan
                # below sees just the four bytes already read, so its verdict is a best guess.
                if ($item.Length -lt $MaxFileLength) {
                    $bytes = [byte[]](Get-Content $Path -Encoding Byte -Raw)
                }

                $isAscii = $true
                $byteCounter = 0
                foreach ($byte in $bytes) {
                    $byteCounter += 1

                    # If it is a tab (9), LF (10), CR (13) or alphanumeric+symbols (32-126), it's valid
                    $whitespaceChars = $byte -eq 9 -or $byte -eq 10 -or $byte -eq 13
                    if ($whitespaceChars) {
                        continue
                    }

                    if ($byte -gt 126 -or $byte -lt 32) {
                        # Byte is outside the typical range of bytes for ASCII text
                        $isAscii = $false
                        $description = "File appears to contain non-ASCII characters (current byte value: $byte at counter: $byteCounter)"
                        break
                    }
                }

                if ($isAscii) {
                    $utf7TestResult = $false
                    # You can't know if something is UTF7 without reading it entirely in and trying to parse as UTF7.
                    # UTF7 is literally "encode Unicode as ASCII" much as base64 encoding might
                    if ($TestForUTF7) {
                        Write-Host "$logLead : Testing for UTF7"
                        $asciiTextRaw = (Get-Content -Path $Path -Encoding ASCII -Raw)
                        $utf7TextRaw = (Get-Content -Path $Path -Encoding UTF7 -Raw)
                        # If decoding as UTF7 changes the text, the file contains UTF7 escape sequences.
                        if ($asciiTextRaw -ne $utf7TextRaw) {
                            $encoding = [System.Text.Encoding]::UTF7
                            $description = "File appears to be UTF7 encoded"
                            $utf7TestResult = $true
                        }
                    }

                    if (!$utf7TestResult) {
                        $encoding = [System.Text.Encoding]::ASCII
                        $description = "File appears to only contain ASCII characters"
                    }
                }
            }
        } else {
            $description = "Zero-byte file, can not determine encoding"
        }

        $returns += @(@{File = $Path.Path; Encoding = $encoding; Description = $description; })
    }

    return $returns
}