From 75a294993c47ebd523f69521955d15cc502793b3 Mon Sep 17 00:00:00 2001 From: Dan Travison Date: Wed, 16 Aug 2017 16:25:16 -0700 Subject: [PATCH] In Web CmdLets, Use HTML meta charset attribute value, if present (#4338) * Use HTML meta charset attribute value, if present, when the Content-Type header does not specify it. --- .../BasicHtmlWebResponseObject.Common.cs | 14 +- .../CoreCLR/HtmlWebResponseObject.CoreClr.cs | 33 +++ .../InvokeRestMethodCommand.CoreClr.cs | 2 +- .../utility/WebCmdlet/StreamHelper.cs | 84 ++++-- .../WebCmdlets.Tests.ps1 | 271 +++++++++++++++++- 5 files changed, 381 insertions(+), 23 deletions(-) diff --git a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/Common/BasicHtmlWebResponseObject.Common.cs b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/Common/BasicHtmlWebResponseObject.Common.cs index 7f17c24f0..ae91ff001 100644 --- a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/Common/BasicHtmlWebResponseObject.Common.cs +++ b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/Common/BasicHtmlWebResponseObject.Common.cs @@ -25,6 +25,14 @@ namespace Microsoft.PowerShell.Commands /// public new string Content { get; private set; } + /// + /// Gets the Encoding that was used to decode the Content + /// + /// + /// The Encoding used to decode the Content; otherwise, a null reference if the content is not text. + /// + public Encoding Encoding { get; private set; } + private WebCmdletElementCollection _inputFields; /// @@ -217,14 +225,16 @@ namespace Microsoft.PowerShell.Commands /// /// Reads the response content from the web response. 
/// - private void InitializeContent() + protected void InitializeContent() { string contentType = ContentHelper.GetContentType(BaseResponse); if (ContentHelper.IsText(contentType)) { + Encoding encoding = null; // fill the Content buffer string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse); - this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet); + this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding); + this.Encoding = encoding; } else { diff --git a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/HtmlWebResponseObject.CoreClr.cs b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/HtmlWebResponseObject.CoreClr.cs index 09879c6e3..c87d24a0d 100644 --- a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/HtmlWebResponseObject.CoreClr.cs +++ b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/HtmlWebResponseObject.CoreClr.cs @@ -49,8 +49,41 @@ namespace Microsoft.PowerShell.Commands #endregion Constructors + #region Properties + + /// + /// Gets the Encoding that was used to decode the Content + /// + /// + /// The Encoding used to decode the Content; otherwise, a null reference if the content is not text. + /// + public Encoding Encoding { get; private set; } + + #endregion Properties + #region Methods + // NOTE: Currently this code path is not enabled. + // See FillRequestStream in WebRequestPSCmdlet.CoreClr.cs and + // GetResponseObject in WebResponseObjectFactory.CoreClr.cs for details. 
+ private void InitializeContent() + { + string contentType = ContentHelper.GetContentType(BaseResponse); + string content = null; + if (ContentHelper.IsText(contentType)) + { + Encoding encoding = null; + // fill the Content buffer + string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse); + this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding); + this.Encoding = encoding; + } + else + { + this.Content = string.Empty; + } + } + private void InitializeRawContent(HttpResponseMessage baseResponse) { StringBuilder raw = ContentHelper.GetRawContentHeader(baseResponse); diff --git a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/InvokeRestMethodCommand.CoreClr.cs b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/InvokeRestMethodCommand.CoreClr.cs index 7d0d6c2d3..8be7c9834 100644 --- a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/InvokeRestMethodCommand.CoreClr.cs +++ b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/CoreCLR/InvokeRestMethodCommand.CoreClr.cs @@ -51,7 +51,7 @@ namespace Microsoft.PowerShell.Commands object obj = null; Exception ex = null; - string str = StreamHelper.DecodeStream(responseStream, encoding); + string str = StreamHelper.DecodeStream(responseStream, ref encoding); bool convertSuccess = false; // On CoreCLR, we need to explicitly load Json.NET diff --git a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/StreamHelper.cs b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/StreamHelper.cs index 0c905c2f9..cc4f0cacf 100644 --- a/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/StreamHelper.cs +++ b/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/StreamHelper.cs @@ -4,6 +4,7 @@ Copyright (c) Microsoft Corporation. All rights reserved. 
using System; using System.Text; +using System.Text.RegularExpressions; using System.IO; using System.IO.Compression; using System.Management.Automation; @@ -391,20 +392,8 @@ namespace Microsoft.PowerShell.Commands } } - internal static string DecodeStream(Stream stream, string characterSet) + private static string StreamToString(Stream stream, Encoding encoding) { - Encoding encoding = ContentHelper.GetEncodingOrDefault(characterSet); - return DecodeStream(stream, encoding); - } - - internal static string DecodeStream(Stream stream, Encoding encoding) - { - if (null == encoding) - { - // just use the default encoding if one wasn't provided - encoding = ContentHelper.GetDefaultEncoding(); - } - StringBuilder result = new StringBuilder(capacity: ChunkSize); Decoder decoder = encoding.GetDecoder(); @@ -413,9 +402,8 @@ namespace Microsoft.PowerShell.Commands { useBufferSize = encoding.GetMaxCharCount(10); } + char[] chars = new char[useBufferSize]; - - byte[] bytes = new byte[useBufferSize * 4]; int bytesRead = 0; do @@ -444,12 +432,74 @@ namespace Microsoft.PowerShell.Commands // Increment byteIndex to the next block of bytes in the input buffer, if any, to convert. 
byteIndex += bytesUsed; } - } - while (bytesRead != 0); + } while (bytesRead != 0); return result.ToString(); } + internal static string DecodeStream(Stream stream, string characterSet, out Encoding encoding) + { + try + { + encoding = Encoding.GetEncoding(characterSet); + } + catch (ArgumentException) + { + encoding = null; + } + return DecodeStream(stream, ref encoding); + } + + static bool TryGetEncoding(string characterSet, out Encoding encoding) + { + bool result = false; + try + { + encoding = Encoding.GetEncoding(characterSet); + result = true; + } + catch (ArgumentException) + { + encoding = null; + } + return result; + } + + static readonly Regex s_metaexp = new Regex(@"<]*charset\s*=\s*[""'\n]?(?[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]"); + + internal static string DecodeStream(Stream stream, ref Encoding encoding) + { + bool isDefaultEncoding = false; + if (null == encoding) + { + // Use the default encoding if one wasn't provided + encoding = ContentHelper.GetDefaultEncoding(); + isDefaultEncoding = true; + } + + string content = StreamToString (stream, encoding); + if (isDefaultEncoding) do + { + // check for a charset attribute on the meta element to override the default. + Match match = s_metaexp.Match(content); + if (match.Success) + { + Encoding localEncoding = null; + string characterSet = match.Groups["charset"].Value; + + if (TryGetEncoding(characterSet, out localEncoding)) + { + stream.Seek(0, SeekOrigin.Begin); + content = StreamToString(stream, localEncoding); + // report the encoding used. 
+ encoding = localEncoding; + } + } + } while (false); + + return content; + } + internal static Byte[] EncodeToBytes(String str, Encoding encoding) { if (null == encoding) diff --git a/test/powershell/Modules/Microsoft.PowerShell.Utility/WebCmdlets.Tests.ps1 b/test/powershell/Modules/Microsoft.PowerShell.Utility/WebCmdlets.Tests.ps1 index b2fdd05ae..638f94f1d 100644 --- a/test/powershell/Modules/Microsoft.PowerShell.Utility/WebCmdlets.Tests.ps1 +++ b/test/powershell/Modules/Microsoft.PowerShell.Utility/WebCmdlets.Tests.ps1 @@ -248,9 +248,9 @@ function ExecuteRequestWithCustomUserAgent { try { $Params = @{ - Uri = $Uri - TimeoutSec = 5 - UserAgent = $UserAgent + Uri = $Uri + TimeoutSec = 5 + UserAgent = $UserAgent SkipHeaderValidation = $SkipHeaderValidation.IsPresent } if ($Cmdlet -eq 'Invoke-WebRequest') { @@ -271,6 +271,32 @@ function ExecuteRequestWithCustomUserAgent { return $result } +# This function calls Invoke-WebRequest with the given uri +function ExecuteWebRequest +{ + param ( + [Parameter(Mandatory)] + [string] + $Uri, + + [switch] $UseBasicParsing + ) + $result = [PSObject]@{Output = $null; Error = $null; Content = $null} + + try + { + $result.Output = Invoke-WebRequest -Uri $Uri -TimeoutSec 5 -UseBasicParsing:$UseBasicParsing.IsPresent + $result.Content = $result.Output.Content + } + catch + { + $result.Error = $_ + } + + return $result +} + + <# Defines the list of redirect codes to test as well as the expected Method when the redirection is handled. @@ -805,6 +831,245 @@ Describe "Invoke-WebRequest tests" -Tags "Feature" { #endregion SkipHeaderVerification Tests + #region charset encoding tests + + Context "BasicHtmlWebResponseObject Encoding tests" { + It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." 
{ + $output = '' + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." { + $output = @' + + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects charset meta value when the attribute value is unquoted." { + $output = '' + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." 
{ + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." { + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." 
{ + $output = '' + # NOTE: meta charset should be ignored + $expectedEncoding = [System.Text.Encoding]::UTF8 + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" { + $output = '' + # NOTE: meta charset should be ignored + $expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" { + $output = '' + $expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" -UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" { + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" 
-UseBasicParsing + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject' + } + } + + Context "HtmlWebResponseObject Encoding" { + # these tests are dependent on https://github.com/PowerShell/PowerShell/issues/2867 + # Currently, all paths return BasicHtmlWebResponseObject + It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." -Pending { + $output = '' + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + # Update to test for HtmlWebResponseObject when mshtml dependency has been resolved. + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." -Pending { + $output = @' + + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." -Pending { + $output = '' + # NOTE: meta charset should be ignored + $expectedEncoding = [System.Text.Encoding]::UTF8 + # Update to test for HtmlWebResponseObject when mshtml dependency has been resolved. 
+ $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + # Update to test for HtmlWebResponseObject when mshtml dependency has been resolved. + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." -Pending { + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." 
-Pending { + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" -Pending { + $output = '' + # NOTE: meta charset should be ignored + $expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + # Update to test for HtmlWebResponseObject when mshtml dependency has been resolved. + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" -Pending { + $output = '' + $expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + # Update to test for HtmlWebResponseObject when mshtml dependency has been resolved. 
+ $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + + It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" -Pending { + $output = @' + + + + +'@ + $expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1') + $response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" + + $response.Error | Should BeNullOrEmpty + $response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName + $response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject' + } + } + + #endregion charset encoding tests + BeforeEach { if ($env:http_proxy) { $savedHttpProxy = $env:http_proxy