In Web CmdLets, Use HTML meta charset attribute value, if present (#4338)

* Use the HTML meta charset attribute value, if present, when the Content-Type header does not specify a charset.
This commit is contained in:
Dan Travison 2017-08-16 16:25:16 -07:00 committed by Travis Plunk
parent c699e73ebe
commit 75a294993c
5 changed files with 381 additions and 23 deletions

View file

@ -25,6 +25,14 @@ namespace Microsoft.PowerShell.Commands
/// </summary> /// </summary>
public new string Content { get; private set; } public new string Content { get; private set; }
/// <summary>
/// Gets the Encoding that was used to decode the Content.
/// </summary>
/// <value>
/// The Encoding used to decode the Content; otherwise, a null reference if the content is not text.
/// </value>
/// <remarks>
/// Assigned by InitializeContent from the encoding reported by StreamHelper.DecodeStream.
/// </remarks>
public Encoding Encoding { get; private set; }
private WebCmdletElementCollection _inputFields; private WebCmdletElementCollection _inputFields;
/// <summary> /// <summary>
@ -217,14 +225,16 @@ namespace Microsoft.PowerShell.Commands
/// <summary> /// <summary>
/// Reads the response content from the web response. /// Reads the response content from the web response.
/// </summary> /// </summary>
private void InitializeContent() protected void InitializeContent()
{ {
string contentType = ContentHelper.GetContentType(BaseResponse); string contentType = ContentHelper.GetContentType(BaseResponse);
if (ContentHelper.IsText(contentType)) if (ContentHelper.IsText(contentType))
{ {
Encoding encoding = null;
// fill the Content buffer // fill the Content buffer
string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse); string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse);
this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet); this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding);
this.Encoding = encoding;
} }
else else
{ {

View file

@ -49,8 +49,41 @@ namespace Microsoft.PowerShell.Commands
#endregion Constructors #endregion Constructors
#region Properties
/// <summary>
/// Gets the Encoding that was used to decode the Content.
/// </summary>
/// <value>
/// The Encoding used to decode the Content; otherwise, a null reference if the content is not text.
/// </value>
/// <remarks>
/// Assigned by InitializeContent from the encoding reported by StreamHelper.DecodeStream.
/// </remarks>
public Encoding Encoding { get; private set; }
#endregion Properties
#region Methods #region Methods
/// <summary>
/// Reads the response content from the web response and records the
/// encoding that was used to decode it.
/// </summary>
/// <remarks>
/// When the response content type is text, the raw content stream is decoded
/// and both Content and Encoding are set. Otherwise Content is set to an empty
/// string and Encoding is left untouched.
/// </remarks>
// NOTE: Currently this code path is not enabled.
// See FillRequestStream in WebRequestPSCmdlet.CoreClr.cs and
// GetResponseObject in WebResponseObjectFactory.CoreClr.cs for details.
private void InitializeContent()
{
    string contentType = ContentHelper.GetContentType(BaseResponse);
    if (ContentHelper.IsText(contentType))
    {
        // Fill the Content buffer. The charset comes from the response headers;
        // DecodeStream reports back the encoding it actually used.
        Encoding encoding;
        string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse);
        this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding);
        this.Encoding = encoding;
    }
    else
    {
        this.Content = string.Empty;
    }
}
private void InitializeRawContent(HttpResponseMessage baseResponse) private void InitializeRawContent(HttpResponseMessage baseResponse)
{ {
StringBuilder raw = ContentHelper.GetRawContentHeader(baseResponse); StringBuilder raw = ContentHelper.GetRawContentHeader(baseResponse);

View file

@ -51,7 +51,7 @@ namespace Microsoft.PowerShell.Commands
object obj = null; object obj = null;
Exception ex = null; Exception ex = null;
string str = StreamHelper.DecodeStream(responseStream, encoding); string str = StreamHelper.DecodeStream(responseStream, ref encoding);
bool convertSuccess = false; bool convertSuccess = false;
// On CoreCLR, we need to explicitly load Json.NET // On CoreCLR, we need to explicitly load Json.NET

View file

@ -4,6 +4,7 @@ Copyright (c) Microsoft Corporation. All rights reserved.
using System; using System;
using System.Text; using System.Text;
using System.Text.RegularExpressions;
using System.IO; using System.IO;
using System.IO.Compression; using System.IO.Compression;
using System.Management.Automation; using System.Management.Automation;
@ -391,20 +392,8 @@ namespace Microsoft.PowerShell.Commands
} }
} }
internal static string DecodeStream(Stream stream, string characterSet) private static string StreamToString(Stream stream, Encoding encoding)
{ {
Encoding encoding = ContentHelper.GetEncodingOrDefault(characterSet);
return DecodeStream(stream, encoding);
}
internal static string DecodeStream(Stream stream, Encoding encoding)
{
if (null == encoding)
{
// just use the default encoding if one wasn't provided
encoding = ContentHelper.GetDefaultEncoding();
}
StringBuilder result = new StringBuilder(capacity: ChunkSize); StringBuilder result = new StringBuilder(capacity: ChunkSize);
Decoder decoder = encoding.GetDecoder(); Decoder decoder = encoding.GetDecoder();
@ -413,9 +402,8 @@ namespace Microsoft.PowerShell.Commands
{ {
useBufferSize = encoding.GetMaxCharCount(10); useBufferSize = encoding.GetMaxCharCount(10);
} }
char[] chars = new char[useBufferSize]; char[] chars = new char[useBufferSize];
byte[] bytes = new byte[useBufferSize * 4]; byte[] bytes = new byte[useBufferSize * 4];
int bytesRead = 0; int bytesRead = 0;
do do
@ -444,12 +432,74 @@ namespace Microsoft.PowerShell.Commands
// Increment byteIndex to the next block of bytes in the input buffer, if any, to convert. // Increment byteIndex to the next block of bytes in the input buffer, if any, to convert.
byteIndex += bytesUsed; byteIndex += bytesUsed;
} }
} } while (bytesRead != 0);
while (bytesRead != 0);
return result.ToString(); return result.ToString();
} }
/// <summary>
/// Decodes a stream to a string using the named character set.
/// </summary>
/// <param name="stream">The stream to decode.</param>
/// <param name="characterSet">
/// The name of the character set to decode with. A null or unrecognized name
/// is treated as "no encoding specified".
/// </param>
/// <param name="encoding">
/// On return, the encoding that was actually used to decode the stream.
/// </param>
/// <returns>The decoded content.</returns>
internal static string DecodeStream(Stream stream, string characterSet, out Encoding encoding)
{
    // TryGetEncoding leaves 'encoding' null when the name cannot be resolved;
    // the ref overload then substitutes the default encoding and reports it back.
    TryGetEncoding(characterSet, out encoding);
    return DecodeStream(stream, ref encoding);
}
/// <summary>
/// Attempts to resolve a character set name to an Encoding without throwing.
/// </summary>
/// <param name="characterSet">The character set name to look up.</param>
/// <param name="encoding">
/// The resolved encoding on success; otherwise, a null reference.
/// </param>
/// <returns>true if the name was resolved; otherwise, false.</returns>
static bool TryGetEncoding(string characterSet, out Encoding encoding)
{
    try
    {
        encoding = Encoding.GetEncoding(characterSet);
        return true;
    }
    catch (ArgumentException)
    {
        // Encoding.GetEncoding rejected the name; report failure instead of throwing.
        encoding = null;
        return false;
    }
}
// Extracts the charset declared by an HTML <meta> element, covering both the
// <meta charset="..."> form and the
// <meta http-equiv="content-type" content="...; charset=..."> form.
// The value may be quoted or unquoted and the element may span several lines.
// HTML tag and attribute names are case-insensitive, so the match ignores case.
private static readonly Regex s_metaexp = new Regex(
    @"<meta\s[^><]*charset\s*=\s*[""'\n]?(?<charset>[A-Za-z][^\s""'\n<>]*)[\s""'\n>]",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
/// <summary>
/// Decodes a stream to a string, falling back to the default encoding and,
/// in that case, honoring a charset declared by an HTML meta element.
/// </summary>
/// <param name="stream">The stream to decode.</param>
/// <param name="encoding">
/// The encoding to decode with, or a null reference to use the default
/// encoding. On return, holds the encoding that produced the returned text.
/// </param>
/// <returns>The decoded content.</returns>
/// <remarks>
/// A caller-supplied encoding is always used as-is. Only when the default
/// encoding was substituted is the decoded text searched for a meta charset;
/// if one is found and can be resolved, the stream is rewound and decoded
/// again with that encoding. A non-seekable stream cannot be rewound, so its
/// content is returned as decoded with the default encoding.
/// </remarks>
internal static string DecodeStream(Stream stream, ref Encoding encoding)
{
    bool isDefaultEncoding = false;
    if (null == encoding)
    {
        // Use the default encoding if one wasn't provided
        encoding = ContentHelper.GetDefaultEncoding();
        isDefaultEncoding = true;
    }

    string content = StreamToString(stream, encoding);

    // Stream.Seek throws NotSupportedException on a non-seekable stream,
    // so only attempt the meta charset override when rewinding is possible.
    if (!isDefaultEncoding || !stream.CanSeek)
    {
        return content;
    }

    // check for a charset attribute on the meta element to override the default.
    Match match = s_metaexp.Match(content);
    if (match.Success)
    {
        Encoding localEncoding;
        string characterSet = match.Groups["charset"].Value;
        if (TryGetEncoding(characterSet, out localEncoding))
        {
            stream.Seek(0, SeekOrigin.Begin);
            content = StreamToString(stream, localEncoding);

            // report the encoding used.
            encoding = localEncoding;
        }
    }

    return content;
}
internal static Byte[] EncodeToBytes(String str, Encoding encoding) internal static Byte[] EncodeToBytes(String str, Encoding encoding)
{ {
if (null == encoding) if (null == encoding)

View file

@ -271,6 +271,32 @@ function ExecuteRequestWithCustomUserAgent {
return $result return $result
} }
# Calls Invoke-WebRequest with the given uri (5 second timeout) and captures
# the outcome instead of letting a failure propagate. The returned object has:
#   Output  - the response returned by Invoke-WebRequest; $null if the call failed
#   Content - the Content property of that response; $null if the call failed
#   Error   - the caught error record; $null if the call succeeded
function ExecuteWebRequest
{
    param (
        [Parameter(Mandatory)]
        [string]
        $Uri,

        [switch] $UseBasicParsing
    )

    $outcome = [PSObject]@{Output = $null; Error = $null; Content = $null}

    try
    {
        $webResponse = Invoke-WebRequest -Uri $Uri -TimeoutSec 5 -UseBasicParsing:$UseBasicParsing.IsPresent
        $outcome.Output = $webResponse
        $outcome.Content = $webResponse.Content
    }
    catch
    {
        $outcome.Error = $_
    }

    return $outcome
}
<# <#
Defines the list of redirect codes to test as well as the Defines the list of redirect codes to test as well as the
expected Method when the redirection is handled. expected Method when the redirection is handled.
@ -805,6 +831,245 @@ Describe "Invoke-WebRequest tests" -Tags "Feature" {
#endregion SkipHeaderVerification Tests #endregion SkipHeaderVerification Tests
#region charset encoding tests
# Encoding detection tests for requests made with -UseBasicParsing.
# Each test passes an HTML document in the 'output' query parameter of the
# http://localhost:8080/PowerShell endpoint (and, in some tests, a 'contenttype'
# value), then checks the Encoding reported on the response object and that the
# response is a BasicHtmlWebResponseObject.
# NOTE(review): presumably the endpoint echoes 'output' as the response body and
# 'contenttype' as the Content-Type header - confirm against the test listener.
Context "BasicHtmlWebResponseObject Encoding tests" {
# No contenttype is supplied here, so the meta charset is expected to be used.
It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." {
$output = '<html><head><meta charset="Unicode"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# The meta element is split across several lines.
It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." {
$output = @'
<html>
<head>
<meta
charset="Unicode"
>
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# The charset value has no quotes and has whitespace around the '='.
It "Verifies Invoke-WebRequest detects charset meta value when the attribute value is unquoted." {
$output = '<html><head><meta charset = Unicode></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# The charset is declared through the http-equiv/content form of the meta element.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# Same http-equiv form, with the attributes on separate lines.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." {
$output = @'
<html><head>
<meta
http-equiv="content-type"
content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# contenttype carries charset=utf-8, which must take precedence over the meta value.
It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::UTF8
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# contenttype carries charset=utf-16; the differing meta value (utf-32) must not win.
It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# contenttype has no charset and the meta value is not a resolvable encoding name,
# so the expected result is the iso-8859-1 default.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" {
$output = '<html><head><meta charset="invalid"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# Same as the previous test, using the http-equiv/content form.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Invalid">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
}
# Encoding detection tests for requests made without -UseBasicParsing, which
# are expected to yield an HtmlWebResponseObject. Every test is marked -Pending.
Context "HtmlWebResponseObject Encoding" {
# these tests are dependent on https://github.com/PowerShell/PowerShell/issues/2867
# Currently, all paths return BasicHtmlWebResponseObject
It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." -Pending {
$output = '<html><head><meta charset="Unicode"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# The meta element is split across several lines.
It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." -Pending {
$output = @'
<html>
<head>
<meta
charset="Unicode"
>
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# contenttype carries charset=utf-8, which must take precedence over the meta value.
It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." -Pending {
$output = '<html><head><meta charset="utf-16"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::UTF8
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# The charset is declared through the http-equiv/content form of the meta element.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." -Pending {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# Same http-equiv form, with the attributes on separate lines.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." -Pending {
$output = @'
<html><head>
<meta
http-equiv="content-type"
content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# contenttype carries charset=utf-16; the differing meta value (utf-32) must not win.
It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" -Pending {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# contenttype has no charset and the meta value is not a resolvable encoding name,
# so the expected result is the iso-8859-1 default.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" -Pending {
$output = '<html><head><meta charset="invalid"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# Same as the previous test, using the http-equiv/content form.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" -Pending {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Invalid">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
}
#endregion charset encoding tests
BeforeEach { BeforeEach {
if ($env:http_proxy) { if ($env:http_proxy) {
$savedHttpProxy = $env:http_proxy $savedHttpProxy = $env:http_proxy