In Web CmdLets, Use HTML meta charset attribute value, if present (#4338)

* Use HTML meta charset attribute value, if present, when the Content-Type header does not specify it.
This commit is contained in:
Dan Travison 2017-08-16 16:25:16 -07:00 committed by Travis Plunk
parent c699e73ebe
commit 75a294993c
5 changed files with 381 additions and 23 deletions

View file

@ -25,6 +25,14 @@ namespace Microsoft.PowerShell.Commands
/// </summary>
public new string Content { get; private set; }
/// <summary>
/// Gets the <see cref="System.Text.Encoding"/> that was used to decode <see cref="Content"/>.
/// </summary>
/// <value>
/// The encoding used to decode the content; otherwise, a null reference if the content is not text.
/// </value>
public Encoding Encoding { get; private set; }
private WebCmdletElementCollection _inputFields;
/// <summary>
@ -217,14 +225,16 @@ namespace Microsoft.PowerShell.Commands
/// <summary>
/// Reads the response content from the web response.
/// </summary>
private void InitializeContent()
protected void InitializeContent()
{
string contentType = ContentHelper.GetContentType(BaseResponse);
if (ContentHelper.IsText(contentType))
{
Encoding encoding = null;
// fill the Content buffer
string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse);
this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet);
this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding);
this.Encoding = encoding;
}
else
{

View file

@ -49,8 +49,41 @@ namespace Microsoft.PowerShell.Commands
#endregion Constructors
#region Properties
/// <summary>
/// Gets the <see cref="System.Text.Encoding"/> that was used to decode <see cref="Content"/>.
/// </summary>
/// <value>
/// The encoding used to decode the content; otherwise, a null reference if the content is not text
/// (the property is only assigned when the response content type is treated as text).
/// </value>
public Encoding Encoding { get; private set; }
#endregion Properties
#region Methods
// NOTE: Currently this code path is not enabled.
// See FillRequestStream in WebRequestPSCmdlet.CoreClr.cs and
// GetResponseObject in WebResponseObjectFactory.CoreClr.cs for details.
/// <summary>
/// Reads the response content from the web response and populates
/// <see cref="Content"/> and <see cref="Encoding"/>.
/// </summary>
/// <remarks>
/// When ContentHelper.IsText accepts the response content type, the raw content stream is
/// decoded and the encoding that was actually used is recorded. Otherwise Content is set to an
/// empty string and Encoding is left unassigned.
/// </remarks>
private void InitializeContent()
{
    string contentType = ContentHelper.GetContentType(BaseResponse);
    if (ContentHelper.IsText(contentType))
    {
        Encoding encoding = null;

        // fill the Content buffer; DecodeStream reports back the encoding it ended up using.
        string characterSet = WebResponseHelper.GetCharacterSet(BaseResponse);
        this.Content = StreamHelper.DecodeStream(RawContentStream, characterSet, out encoding);
        this.Encoding = encoding;
    }
    else
    {
        this.Content = string.Empty;
    }
}
private void InitializeRawContent(HttpResponseMessage baseResponse)
{
StringBuilder raw = ContentHelper.GetRawContentHeader(baseResponse);

View file

@ -51,7 +51,7 @@ namespace Microsoft.PowerShell.Commands
object obj = null;
Exception ex = null;
string str = StreamHelper.DecodeStream(responseStream, encoding);
string str = StreamHelper.DecodeStream(responseStream, ref encoding);
bool convertSuccess = false;
// On CoreCLR, we need to explicitly load Json.NET

View file

@ -4,6 +4,7 @@ Copyright (c) Microsoft Corporation. All rights reserved.
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;
using System.Management.Automation;
@ -391,20 +392,8 @@ namespace Microsoft.PowerShell.Commands
}
}
internal static string DecodeStream(Stream stream, string characterSet)
private static string StreamToString(Stream stream, Encoding encoding)
{
Encoding encoding = ContentHelper.GetEncodingOrDefault(characterSet);
return DecodeStream(stream, encoding);
}
internal static string DecodeStream(Stream stream, Encoding encoding)
{
if (null == encoding)
{
// just use the default encoding if one wasn't provided
encoding = ContentHelper.GetDefaultEncoding();
}
StringBuilder result = new StringBuilder(capacity: ChunkSize);
Decoder decoder = encoding.GetDecoder();
@ -413,9 +402,8 @@ namespace Microsoft.PowerShell.Commands
{
useBufferSize = encoding.GetMaxCharCount(10);
}
char[] chars = new char[useBufferSize];
byte[] bytes = new byte[useBufferSize * 4];
int bytesRead = 0;
do
@ -444,12 +432,74 @@ namespace Microsoft.PowerShell.Commands
// Increment byteIndex to the next block of bytes in the input buffer, if any, to convert.
byteIndex += bytesUsed;
}
}
while (bytesRead != 0);
} while (bytesRead != 0);
return result.ToString();
}
/// <summary>
/// Decodes <paramref name="stream"/> using the encoding named by <paramref name="characterSet"/>.
/// </summary>
/// <param name="stream">The stream to decode.</param>
/// <param name="characterSet">
/// The character set name to look up. May be null, empty, or unrecognized, in which case no
/// encoding is preselected and the choice is left to the ref-encoding overload.
/// </param>
/// <param name="encoding">Receives the encoding that was used to produce the returned string.</param>
/// <returns>The decoded content.</returns>
internal static string DecodeStream(Stream stream, string characterSet, out Encoding encoding)
{
    encoding = null;

    // A missing charset is an expected input, so skip the lookup rather than
    // relying on Encoding.GetEncoding throwing and the exception being swallowed.
    if (!string.IsNullOrEmpty(characterSet))
    {
        // On failure TryGetEncoding leaves encoding null, matching the no-charset case.
        TryGetEncoding(characterSet, out encoding);
    }

    return DecodeStream(stream, ref encoding);
}
/// <summary>
/// Attempts to resolve an encoding by name without letting a lookup failure escape.
/// </summary>
/// <param name="characterSet">The encoding name to resolve.</param>
/// <param name="encoding">
/// Receives the resolved encoding, or null when Encoding.GetEncoding rejects the name with an
/// <see cref="ArgumentException"/>.
/// </param>
/// <returns>true if the name was resolved; otherwise, false.</returns>
private static bool TryGetEncoding(string characterSet, out Encoding encoding)
{
    try
    {
        encoding = Encoding.GetEncoding(characterSet);
        return true;
    }
    catch (ArgumentException)
    {
        encoding = null;
        return false;
    }
}
// Finds a charset declaration inside an HTML <meta> element and captures its value in the
// "charset" group. It matches both <meta charset=...> and the http-equiv form, where charset=
// appears inside the content attribute. HTML tag and attribute names are case-insensitive, so
// the match ignores case; CultureInvariant keeps that casing independent of the current culture.
private static readonly Regex s_metaexp = new Regex(
    @"<meta\s[.\n]*[^><]*charset\s*=\s*[""'\n]?(?<charset>[A-Za-z].[^\s""'\n<>]*)[\s""'\n>]",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
/// <summary>
/// Decodes <paramref name="stream"/> into a string. When no encoding is supplied, the default
/// encoding is used first, and the content is decoded again if it declares a recognized
/// charset through an HTML meta element.
/// </summary>
/// <param name="stream">The stream to decode.</param>
/// <param name="encoding">
/// On input, the encoding to use, or null to use the default. On return, the encoding that was
/// used to produce the returned string.
/// </param>
/// <returns>The decoded content.</returns>
internal static string DecodeStream(Stream stream, ref Encoding encoding)
{
    bool isDefaultEncoding = false;
    if (null == encoding)
    {
        // Use the default encoding if one wasn't provided
        encoding = ContentHelper.GetDefaultEncoding();
        isDefaultEncoding = true;
    }

    string content = StreamToString(stream, encoding);

    // An explicitly supplied encoding always wins over anything declared in the content.
    // Re-decoding also needs the stream rewound; if it cannot seek, keep the content already
    // decoded with the default encoding instead of failing with NotSupportedException.
    if (!isDefaultEncoding || !stream.CanSeek)
    {
        return content;
    }

    // check for a charset attribute on the meta element to override the default.
    Match match = s_metaexp.Match(content);
    if (match.Success)
    {
        Encoding localEncoding = null;
        string characterSet = match.Groups["charset"].Value;

        // An unsupported or invalid charset is ignored and the default decoding is kept.
        if (TryGetEncoding(characterSet, out localEncoding))
        {
            stream.Seek(0, SeekOrigin.Begin);
            content = StreamToString(stream, localEncoding);

            // report the encoding used.
            encoding = localEncoding;
        }
    }

    return content;
}
internal static Byte[] EncodeToBytes(String str, Encoding encoding)
{
if (null == encoding)

View file

@ -248,9 +248,9 @@ function ExecuteRequestWithCustomUserAgent {
try {
$Params = @{
Uri = $Uri
TimeoutSec = 5
UserAgent = $UserAgent
Uri = $Uri
TimeoutSec = 5
UserAgent = $UserAgent
SkipHeaderValidation = $SkipHeaderValidation.IsPresent
}
if ($Cmdlet -eq 'Invoke-WebRequest') {
@ -271,6 +271,32 @@ function ExecuteRequestWithCustomUserAgent {
return $result
}
# Runs Invoke-WebRequest against the given uri (5 second timeout) and captures the outcome.
# The returned object carries the response in Output, its Content in Content, and any
# terminating error in Error instead of letting it propagate to the caller.
function ExecuteWebRequest
{
    param (
        [Parameter(Mandatory)]
        [string]
        $Uri,

        [switch] $UseBasicParsing
    )

    $result = [PSObject]@{Output = $null; Error = $null; Content = $null}

    # Splat the arguments; a boolean value for a switch entry is the same as -UseBasicParsing:<bool>.
    $requestArguments = @{
        Uri             = $Uri
        TimeoutSec      = 5
        UseBasicParsing = $UseBasicParsing.IsPresent
    }

    try
    {
        $webResponse = Invoke-WebRequest @requestArguments
        $result.Output = $webResponse
        $result.Content = $webResponse.Content
    }
    catch
    {
        $result.Error = $_
    }

    return $result
}
<#
Defines the list of redirect codes to test as well as the
expected Method when the redirection is handled.
@ -805,6 +831,245 @@ Describe "Invoke-WebRequest tests" -Tags "Feature" {
#endregion SkipHeaderVerification Tests
#region charset encoding tests
# Encoding detection tests for responses requested with -UseBasicParsing.
# Each test passes an HTML body through the 'output' query parameter (and, in some tests, a
# Content-Type through 'contenttype') and then checks the Encoding reported on the response object.
# NOTE(review): assumes the local listener echoes 'output' as the response body and 'contenttype'
# as the Content-Type header - confirm against the test listener implementation.
Context "BasicHtmlWebResponseObject Encoding tests" {
# No contenttype is passed: the charset from <meta charset="..."> is expected to be reported.
It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." {
$output = '<html><head><meta charset="Unicode"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# Same as above, but the meta element is split across several lines.
It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." {
$output = @'
<html>
<head>
<meta
charset="Unicode"
>
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# The charset value has no surrounding quotes and there are spaces around '='.
It "Verifies Invoke-WebRequest detects charset meta value when the attribute value is unquoted." {
$output = '<html><head><meta charset = Unicode></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# The charset is declared through the http-equiv form, inside the content attribute.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# http-equiv form with the meta element's attributes on separate lines.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." {
$output = @'
<html><head>
<meta
http-equiv="content-type"
content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# A charset passed in contenttype (utf-8) must take precedence over the meta charset (utf-32).
It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::UTF8
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# Same precedence check with a non-utf8 charset (utf-16) passed in contenttype.
It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# contenttype carries no charset and the meta charset is not a known encoding name:
# the iso-8859-1 encoding is expected to be reported.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" {
$output = '<html><head><meta charset="invalid"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
# Same fallback check, with the invalid charset declared through the http-equiv form.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Invalid">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output" -UseBasicParsing
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.BasicHtmlWebResponseObject'
}
}
# Encoding detection tests for requests made without -UseBasicParsing, asserting that an
# HtmlWebResponseObject is returned. Every test is marked -Pending (see the issue referenced below).
Context "HtmlWebResponseObject Encoding" {
# these tests are dependent on https://github.com/PowerShell/PowerShell/issues/2867
# Currently, all paths return BasicHtmlWebResponseObject
# No contenttype is passed: the charset from <meta charset="..."> is expected to be reported.
It "Verifies Invoke-WebRequest detects charset meta value when the ContentType header does not define it." -Pending {
$output = '<html><head><meta charset="Unicode"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# Same as above, but the meta element is split across several lines.
It "Verifies Invoke-WebRequest detects charset meta value when newlines are encountered in the element." -Pending {
$output = @'
<html>
<head>
<meta
charset="Unicode"
>
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# A charset passed in contenttype (utf-8) must take precedence over the meta charset (utf-16).
It "Verifies Invoke-WebRequest ignores meta charset value when Content-Type header defines it." -Pending {
$output = '<html><head><meta charset="utf-16"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::UTF8
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-8&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# The charset is declared through the http-equiv form, inside the content attribute.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value when the ContentType header does not define it." -Pending {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# http-equiv form with the meta element's attributes on separate lines.
It "Verifies Invoke-WebRequest detects http-equiv charset meta value newlines are encountered in the element." -Pending {
$output = @'
<html><head>
<meta
http-equiv="content-type"
content="text/html; charset=Unicode">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('Unicode')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# Same precedence check with a non-utf8 charset (utf-16) passed in contenttype.
It "Verifies Invoke-WebRequest honors non-utf8 charsets in the Content-Type header" -Pending {
$output = '<html><head><meta charset="utf-32"></head></html>'
# NOTE: meta charset should be ignored
$expectedEncoding = [System.Text.Encoding]::GetEncoding('utf-16')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html; charset=utf-16&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# contenttype carries no charset and the meta charset is not a known encoding name:
# the iso-8859-1 encoding is expected to be reported.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared" -Pending {
$output = '<html><head><meta charset="invalid"></head></html>'
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
# Update to test for HtmlWebResponseObject when mshtml dependency has been resolved.
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
# Same fallback check, with the invalid charset declared through the http-equiv form.
It "Verifies Invoke-WebRequest defaults to iso-8859-1 when an unsupported/invalid charset is declared using http-equiv" -Pending {
$output = @'
<html><head>
<meta http-equiv="content-type" content="text/html; charset=Invalid">
</head>
</html>
'@
$expectedEncoding = [System.Text.Encoding]::GetEncoding('iso-8859-1')
$response = ExecuteWebRequest -Uri "http://localhost:8080/PowerShell?test=response&contenttype=text/html&output=$output"
$response.Error | Should BeNullOrEmpty
$response.Output.Encoding.EncodingName | Should Be $expectedEncoding.EncodingName
$response.Output | Should BeOfType 'Microsoft.PowerShell.Commands.HtmlWebResponseObject'
}
}
#endregion charset encoding tests
BeforeEach {
if ($env:http_proxy) {
$savedHttpProxy = $env:http_proxy