# ps/Modules/Cole.PowerShell.Developer/Public/Read-Xml.ps1
#
# Repository viewer metadata (not part of the script): 204 lines, 8.6 KiB, PowerShell.
# Timestamp shown by the viewer: 2023-05-30 22:51:22 -07:00
function Read-Xml {
    <#
    .SYNOPSIS
    Reads an XML file and returns its elements as nested hashtables.

    .DESCRIPTION
    A small hand-rolled XML reader. The file is loaded as a single string and
    scanned for tags. For every element the tag name becomes a key of the result
    hashtable; the element's attributes, its parsed child elements and (for
    leaf elements) its inner text stored as @{ Text = ... } are combined into
    that key through Merge-Objects.

    XML declarations / processing instructions (<? ... ?>), comments
    (<!-- ... -->), CDATA sections (<![CDATA[ ... ]]>) and other <! ... >
    declarations are skipped.

    Known limitations:
      - A '>' inside an attribute value ends the tag early.
      - Closing tags must be written exactly as </name> (no inner whitespace).
      - XML entities (&amp; etc.) are not decoded.
      - Same-named tags that appear inside comments or CDATA can confuse the
        search for the matching closing tag.

    Depends on Get-LogLeadName and Merge-Objects, which are defined elsewhere.
    NOTE(review): the exact merge semantics of -DontClobber / -DontDeepMerge
    are not visible here; the call is kept as it was.

    .PARAMETER Path
    Path of the XML file to read. Must resolve to an existing file.

    .OUTPUTS
    System.Collections.Hashtable keyed by element name.
    #>
    param (
        [Parameter(Mandatory = $false)]
        [ValidateNotNullOrEmpty()]
        [string]$Path
    )
    $logLead = (Get-LogLeadName)
    # -Path is optional in the signature, so guard against it being omitted
    # before handing it to Test-Path (which rejects an empty string).
    if ([string]::IsNullOrWhiteSpace($Path) -or !(Test-Path -Path $Path -PathType Leaf)) {
        throw "$logLead : Not able to resolve [$Path]"
    }

    # True when $text contains $prefix at position $index (ordinal, bounds-safe).
    function testPrefixAt {
        param (
            [string]$text,
            [int]$index,
            [string]$prefix
        )
        return ([string]::CompareOrdinal($text, $index, $prefix, 0, $prefix.Length) -eq 0)
    }

    # Returns the index just past the next $terminator at or after $from;
    # throws when the construct is never terminated (instead of looping forever).
    function skipPast {
        param (
            [string]$text,
            [string]$terminator,
            [int]$from,
            [string]$what
        )
        $terminatorIndex = $text.IndexOf($terminator, $from, [System.StringComparison]::Ordinal)
        if ($terminatorIndex -eq -1) {
            throw "$logLead : Unterminated $what starting at [$from]"
        }
        return ($terminatorIndex + $terminator.Length)
    }

    # Finds the index of the </tagName> that closes an element whose content
    # starts at $startIndex, counting nested elements of the same name.
    # Returns -1 when no matching closing tag exists.
    function findClosingTagIndex {
        param (
            [string]$content,
            [string]$tagName,
            [int]$startIndex
        )
        $ordinal = [System.StringComparison]::Ordinal
        $openPrefix = "<$tagName"
        $closingTag = "</$tagName>"
        $depth = 1
        $cursor = $startIndex
        while ($cursor -lt $content.Length) {
            $nextClose = $content.IndexOf($closingTag, $cursor, $ordinal)
            if ($nextClose -eq -1) {
                return -1
            }
            $nextOpen = $content.IndexOf($openPrefix, $cursor, $ordinal)
            if (($nextOpen -ne -1) -and ($nextOpen -lt $nextClose)) {
                $afterName = $nextOpen + $openPrefix.Length
                $openTagEnd = $content.IndexOf([char]'>', $afterName)
                if ($openTagEnd -eq -1) {
                    return -1
                }
                # "<Item" must not be mistaken for "<ItemGroup": the name has to
                # be followed by whitespace, '/' or '>'.
                $boundary = $content[$afterName]
                $isSameName = ($boundary -eq '>') -or ($boundary -eq '/') -or [char]::IsWhiteSpace($boundary)
                # A nested self-closing <name/> does not need its own closing tag.
                $nestedSelfClosing = ($content[$openTagEnd - 1] -eq '/')
                if ($isSameName -and (-not $nestedSelfClosing)) {
                    $depth++
                }
                $cursor = $openTagEnd + 1
                continue
            }
            $depth--
            if ($depth -eq 0) {
                return $nextClose
            }
            $cursor = $nextClose + $closingTag.Length
        }
        return -1
    }

    # Splits the text between '<' and '>' (without a trailing '/') into the tag
    # name and a hashtable of attributes. Returns two values: name, attributes.
    # Attributes is $null when the tag content holds nothing but the name.
    function parseTag {
        param (
            [string]$tagContent
        )
        # Content looks like
        #   tagName attribute-name="attribute-value"
        # or
        #   tagName attribute-name='attribute-value'
        # Either quote style is accepted; a value ends at the next matching quote.
        # Spaces and foreign quotes inside a value are temporarily swapped for
        # control characters so the tag can be tokenised by splitting on spaces.
        $markerSingleQuote = [char]17
        $markerDoubleQuote = [char]18
        $markerSpace = [char]19

        $tagContent = $tagContent.Replace('\"', [string]$markerDoubleQuote).Replace("\'", [string]$markerSingleQuote)

        # Work on characters, not UTF-8 bytes: byte offsets stop lining up with
        # character offsets as soon as the tag contains a non-ASCII character.
        $tagChars = $tagContent.ToCharArray()
        $openQuote = $null
        for ($i = 0; $i -lt $tagChars.Length; $i++) {
            $char = $tagChars[$i]
            if ($null -ne $openQuote) {
                if ($char -eq $openQuote) {
                    # end of the quoted value
                    $openQuote = $null
                } elseif ($char -eq ' ') {
                    $tagChars[$i] = $markerSpace
                } elseif ($char -eq '"') {
                    $tagChars[$i] = $markerDoubleQuote
                } elseif ($char -eq "'") {
                    $tagChars[$i] = $markerSingleQuote
                }
            } elseif (($char -eq '"') -or ($char -eq "'")) {
                $openQuote = $char
            } elseif ([char]::IsWhiteSpace($char)) {
                # tabs / newlines between attributes separate tokens just like spaces
                $tagChars[$i] = [char]' '
            }
        }
        $tagContent = -join $tagChars

        # <tagName attribute = "value"> becomes <tagName attribute="value">
        $tagContent = $tagContent -replace ' *= *', '='

        # Values are escaped now, so: split on spaces, then on '=', then drop quotes.
        $splits = $tagContent -split ' '
        $tagName = $splits[0]
        $attributes = $null
        if ($splits.Count -gt 1) {
            $attributes = @{}
        }
        for ($i = 1; $i -lt $splits.Count; $i++) {
            $rawAttribute = $splits[$i]
            if ([string]::IsNullOrWhiteSpace($rawAttribute)) {
                # runs of spaces produce empty tokens
                continue
            }
            $equalsIndex = $rawAttribute.IndexOf([char]'=')
            if ($equalsIndex -eq -1) {
                # a stand-alone attribute is set equal to itself
                $name = $rawAttribute
                $value = $rawAttribute
            } else {
                $name = $rawAttribute.Substring(0, $equalsIndex)
                $value = $rawAttribute.Substring($equalsIndex + 1).Replace('"', '').Replace("'", '')
                $value = $value.Replace([string]$markerSpace, ' ').Replace([string]$markerDoubleQuote, '"').Replace([string]$markerSingleQuote, "'")
            }
            # Indexer instead of dot-notation so names such as "Count" or "Keys"
            # cannot collide with the hashtable's own properties.
            $attributes[$name] = $value
        }
        return $tagName, $attributes
    }

    # Parses every element found at the top level of $content (recursing into
    # element bodies) and returns a hashtable keyed by tag name.
    function parseNodes {
        param (
            $content
        )
        $parsedElements = @{}
        if ([string]::IsNullOrEmpty($content)) {
            return $parsedElements
        }
        $ordinal = [System.StringComparison]::Ordinal
        $position = 0
        while ($position -lt $content.Length) {
            $tagStart = $content.IndexOf([char]'<', $position)
            if ($tagStart -eq -1) {
                break
            }

            # Things that are not elements: skip them entirely.
            if (testPrefixAt $content $tagStart '<?') {
                $position = skipPast $content '?>' ($tagStart + 2) 'processing instruction'
                continue
            }
            if (testPrefixAt $content $tagStart '<!--') {
                $position = skipPast $content '-->' ($tagStart + 4) 'comment'
                continue
            }
            if (testPrefixAt $content $tagStart '<![CDATA[') {
                # Nested CDATA is not supported; the section ends at the first ]]>
                $position = skipPast $content ']]>' ($tagStart + 9) 'CDATA section'
                continue
            }
            if (testPrefixAt $content $tagStart '<!') {
                $position = skipPast $content '>' ($tagStart + 2) 'declaration'
                continue
            }

            $tagEnd = $content.IndexOf([char]'>', $tagStart + 1)
            if ($tagEnd -eq -1) {
                # A trailing, never-closed '<...' is ignored.
                break
            }
            $currentTag = $content.Substring($tagStart + 1, $tagEnd - $tagStart - 1)
            if ($currentTag.Length -eq 0) {
                throw "$logLead : Found an empty or invalid tag at [$tagEnd]"
            }

            $isSelfClosing = $currentTag.EndsWith('/', $ordinal)
            if ($isSelfClosing) {
                $currentTag = $currentTag.Substring(0, $currentTag.Length - 1)
            }
            $parsedTag, $nodeSet = (parseTag $currentTag)
            $foundNodes = $null
            $innerText = $null
            $position = $tagEnd + 1

            if (-not $isSelfClosing) {
                $closingTag = "</$($parsedTag)>"
                $closingTagIndex = findClosingTagIndex -content $content -tagName $parsedTag -startIndex ($tagEnd + 1)
                if ($closingTagIndex -eq -1) {
                    Write-Verbose "$logLead : Remaining content: $($content.Substring($tagEnd))"
                    throw "$logLead : Couldn't find a closing tag for [$($parsedTag)] starting at or around [$tagEnd]"
                }
                $innerContent = $content.Substring($tagEnd + 1, $closingTagIndex - $tagEnd - 1)
                if (![string]::IsNullOrWhiteSpace($innerContent)) {
                    if ($innerContent.IndexOf([char]'<') -gt -1) {
                        $childNodes = parseNodes $innerContent
                        if ($null -ne $childNodes) {
                            $foundNodes = $childNodes
                        }
                    } else {
                        $innerText = @{ Text = $innerContent }
                    }
                }
                # Resume exactly after the closing tag so an immediately
                # following sibling (</a><b>) is not skipped.
                $position = $closingTagIndex + $closingTag.Length
            }

            $parsedElements[$parsedTag] = Merge-Objects -Objects $parsedElements[$parsedTag], $nodeSet, $foundNodes, $innerText -DontClobber -DontDeepMerge
        }
        return $parsedElements
    }

    $rawcontent = (Get-Content -Raw -Path $Path)
    $capture = parseNodes $rawcontent
    return $capture
}