PowerShell/src/Microsoft.PowerShell.Commands.Utility/commands/utility/WebCmdlet/Common/BasicHtmlWebResponseObject.Common.cs
2020-03-24 10:51:55 -07:00

292 lines
10 KiB
C#

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Management.Automation;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
namespace Microsoft.PowerShell.Commands
{
/// <summary>
/// Response object for html content without DOM parsing.
/// </summary>
public class BasicHtmlWebResponseObject : WebResponseObject
{
#region Private Fields
// Regular expressions shared by every instance. None has an initializer, so each stays
// null until EnsureHtmlParser() assigns it.
// Splits a single attribute specification into its name and (optional) value.
private static Regex s_attribNameValueRegex;
// Finds each attribute specification inside an element's opening tag.
private static Regex s_attribsRegex;
// Finds img elements in the response content.
private static Regex s_imageRegex;
// Finds input elements in the response content.
private static Regex s_inputFieldRegex;
// Finds a (anchor) elements in the response content.
private static Regex s_linkRegex;
// Isolates the opening tag of an HTML element.
private static Regex s_tagRegex;
#endregion Private Fields
#region Constructors
/// <summary>
/// Initializes a new instance of the <see cref="BasicHtmlWebResponseObject"/> class
/// from an HTTP response, passing <c>null</c> as the content stream.
/// </summary>
/// <param name="response">The HTTP response to wrap.</param>
public BasicHtmlWebResponseObject(HttpResponseMessage response)
: this(response, null)
{ }
/// <summary>
/// Initializes a new instance of the <see cref="BasicHtmlWebResponseObject"/> class
/// from an HTTP response and a content stream.
/// </summary>
/// <param name="response">The HTTP response to wrap; forwarded to the base class constructor.</param>
/// <param name="contentStream">
/// Stream forwarded to the base class constructor. The single-argument constructor passes <c>null</c> here.
/// </param>
public BasicHtmlWebResponseObject(HttpResponseMessage response, Stream contentStream)
: base(response, contentStream)
{
// Create the shared regular expressions up front.
EnsureHtmlParser();
// Content must be populated before InitializeRawContent runs, because that method
// appends Content to the raw header text.
InitializeContent();
InitializeRawContent(response);
}
#endregion Constructors
#region Properties
/// <summary>
/// Gets the text body content of this response.
/// </summary>
/// <value>
/// Content of the response body decoded as text, if the <c>Content-Type</c>
/// response header is a recognized text type. Otherwise an empty string.
/// </value>
/// <remarks>
/// Declared with <c>new</c>, so it hides the <c>Content</c> member inherited from the base class.
/// </remarks>
public new string Content { get; private set; }
/// <summary>
/// Gets the encoding of the text body content of this response.
/// </summary>
/// <value>
/// Encoding reported by the stream decoder when the body was decoded as text.
/// Remains <c>null</c> when the <c>Content-Type</c> is not a recognized text type,
/// because the body is not decoded in that case.
/// </value>
public Encoding Encoding { get; private set; }
private WebCmdletElementCollection _inputFields;

/// <summary>
/// Gets the <c>input</c> elements found in <see cref="Content"/>.
/// </summary>
/// <value>
/// A collection with one object per matched element. It is built on first access
/// and the same instance is returned afterwards.
/// </value>
public WebCmdletElementCollection InputFields
{
    get
    {
        // Already parsed on an earlier access.
        if (_inputFields != null)
        {
            return _inputFields;
        }

        EnsureHtmlParser();

        var elements = new List<PSObject>();
        foreach (Match inputMatch in s_inputFieldRegex.Matches(Content))
        {
            elements.Add(CreateHtmlObject(inputMatch.Value, "INPUT"));
        }

        _inputFields = new WebCmdletElementCollection(elements);
        return _inputFields;
    }
}
private WebCmdletElementCollection _links;

/// <summary>
/// Gets the anchor (<c>a</c>) elements found in <see cref="Content"/>.
/// </summary>
/// <value>
/// A collection with one object per matched element. It is built on first access
/// and the same instance is returned afterwards.
/// </value>
public WebCmdletElementCollection Links
{
    get
    {
        // Already parsed on an earlier access.
        if (_links != null)
        {
            return _links;
        }

        EnsureHtmlParser();

        var elements = new List<PSObject>();
        foreach (Match anchorMatch in s_linkRegex.Matches(Content))
        {
            elements.Add(CreateHtmlObject(anchorMatch.Value, "A"));
        }

        _links = new WebCmdletElementCollection(elements);
        return _links;
    }
}
private WebCmdletElementCollection _images;

/// <summary>
/// Gets the <c>img</c> elements found in <see cref="Content"/>.
/// </summary>
/// <value>
/// A collection with one object per matched element. It is built on first access
/// and the same instance is returned afterwards.
/// </value>
public WebCmdletElementCollection Images
{
    get
    {
        // Already parsed on an earlier access.
        if (_images != null)
        {
            return _images;
        }

        EnsureHtmlParser();

        var elements = new List<PSObject>();
        foreach (Match imgMatch in s_imageRegex.Matches(Content))
        {
            elements.Add(CreateHtmlObject(imgMatch.Value, "IMG"));
        }

        _images = new WebCmdletElementCollection(elements);
        return _images;
    }
}
#endregion Properties
#region Methods
/// <summary>
/// Populates <see cref="Content"/> and <see cref="Encoding"/> from the response body.
/// </summary>
/// <remarks>
/// When the response content type is not text, <see cref="Content"/> is set to an empty
/// string and <see cref="Encoding"/> is not assigned. Otherwise the raw content stream is
/// decoded with the character set taken from the response; a JSON response without a
/// character set is decoded as UTF-8.
/// </remarks>
protected void InitializeContent()
{
    string contentType = ContentHelper.GetContentType(BaseResponse);

    // Non-text payloads are not decoded.
    if (!ContentHelper.IsText(contentType))
    {
        this.Content = string.Empty;
        return;
    }

    string charset = WebResponseHelper.GetCharacterSet(BaseResponse);
    bool jsonWithoutCharset = string.IsNullOrEmpty(charset) && ContentHelper.IsJson(contentType);
    if (jsonWithoutCharset)
    {
        charset = Encoding.UTF8.HeaderName;
    }

    // Decode the body and remember which encoding the decoder reported.
    Encoding detectedEncoding = null;
    this.Content = StreamHelper.DecodeStream(RawContentStream, charset, out detectedEncoding);
    this.Encoding = detectedEncoding;
}
/// <summary>
/// Builds the object that represents one matched HTML element.
/// </summary>
/// <param name="html">Markup of the matched element; stored as the <c>outerHTML</c> property.</param>
/// <param name="tagName">Tag name to report; stored as the <c>tagName</c> property.</param>
/// <returns>
/// A new <see cref="PSObject"/> carrying <c>outerHTML</c>, <c>tagName</c>, and whatever
/// properties <c>ParseAttributes</c> adds for the element's attributes.
/// </returns>
private PSObject CreateHtmlObject(string html, string tagName)
{
    var element = new PSObject();

    // The two fixed properties go in first, then the element's own attributes.
    var outerHtmlProperty = new PSNoteProperty("outerHTML", html);
    element.Properties.Add(outerHtmlProperty);

    var tagNameProperty = new PSNoteProperty("tagName", tagName);
    element.Properties.Add(tagNameProperty);

    ParseAttributes(html, element);

    return element;
}
/// <summary>
/// Creates whichever of the shared HTML regular expressions have not been created yet.
/// </summary>
/// <remarks>
/// A field that is already assigned is left untouched. All patterns use the same options:
/// single-line mode, case-insensitive, compiled.
/// </remarks>
private void EnsureHtmlParser()
{
    const RegexOptions ParserOptions =
        RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // Element-level patterns, applied to the whole response content.
    // NOTE(review): the input pattern only matches an element that ends in "/>" or has a
    // later "</input>"; a bare "<input ...>" with neither is not matched.
    if (s_inputFieldRegex == null)
    {
        s_inputFieldRegex = new Regex(
            @"<input\s+[^>]*(/>|>.*?</input>)",
            ParserOptions);
    }

    if (s_linkRegex == null)
    {
        s_linkRegex = new Regex(
            @"<a\s+[^>]*(/>|>.*?</a>)",
            ParserOptions);
    }

    if (s_imageRegex == null)
    {
        s_imageRegex = new Regex(
            @"<img\s[^>]*?>",
            ParserOptions);
    }

    // Tag- and attribute-level patterns, applied to a single matched element.
    if (s_tagRegex == null)
    {
        s_tagRegex = new Regex(
            @"<\w+((\s+[^""'>/=\s\p{Cc}]+(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+))?)+\s*|\s*)/?>",
            ParserOptions);
    }

    if (s_attribsRegex == null)
    {
        s_attribsRegex = new Regex(
            @"(?<=\s+)([^""'>/=\s\p{Cc}]+(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+))?)",
            ParserOptions);
    }

    if (s_attribNameValueRegex == null)
    {
        s_attribNameValueRegex = new Regex(
            @"([^""'>/=\s\p{Cc}]+)(?:\s*=\s*(?:""(.*?)""|'(.*?)'|([^'"">\s]+)))?",
            ParserOptions);
    }
}
/// <summary>
/// Sets <c>RawContent</c> to the raw header text of the response followed by <see cref="Content"/>.
/// </summary>
/// <param name="baseResponse">Response whose raw header text is used as the prefix.</param>
private void InitializeRawContent(HttpResponseMessage baseResponse)
{
    // The helper returns a StringBuilder holding the header text; the decoded body is appended to it.
    StringBuilder rawBuilder = ContentHelper.GetRawContentHeader(baseResponse);
    this.RawContent = rawBuilder.Append(Content).ToString();
}
/// <summary>
/// Parses the attributes in the opening tag of <paramref name="outerHtml"/> and adds each one
/// to <paramref name="elementObject"/> as a note property.
/// </summary>
/// <param name="outerHtml">Markup of a single HTML element. Null or empty input is ignored.</param>
/// <param name="elementObject">Object that receives one note property per attribute.</param>
/// <remarks>
/// An attribute without a value is added with a <c>null</c> value. An attribute whose name is
/// already a property on <paramref name="elementObject"/> is skipped, so the first definition
/// wins. This covers a repeated attribute and an attribute that collides with a property added
/// before this method runs. Without the check, adding a member whose name is already present
/// makes the property collection throw, and one malformed element would fail the whole parse.
/// </remarks>
private void ParseAttributes(string outerHtml, PSObject elementObject)
{
    // We might get an empty input for a directive from the HTML file
    if (string.IsNullOrEmpty(outerHtml))
    {
        return;
    }

    // Extract just the opening tag of the HTML element (omitting the closing tag and any contents,
    // including contained HTML elements)
    Match openingTag = s_tagRegex.Match(outerHtml);

    // Extract all the attribute specifications within the HTML element opening tag
    foreach (Match attribMatch in s_attribsRegex.Matches(openingTag.Value))
    {
        // Extract the name and value for this attribute (allowing for variations like single/double/no
        // quotes, and no value at all)
        Match nvMatch = s_attribNameValueRegex.Match(attribMatch.Value);
        Debug.Assert(nvMatch.Groups.Count == 5);

        // Name is always captured by group #1
        string name = nvMatch.Groups[1].Value;

        // Skip names that cannot be added: an empty name, or one that already exists on the
        // object (the property lookup returns null when no such member is present).
        if (string.IsNullOrEmpty(name) || elementObject.Properties[name] != null)
        {
            continue;
        }

        // The value (if any) is captured by group #2, #3, or #4, depending on quoting or lack thereof
        string value = null;
        if (nvMatch.Groups[2].Success)
        {
            value = nvMatch.Groups[2].Value;
        }
        else if (nvMatch.Groups[3].Success)
        {
            value = nvMatch.Groups[3].Value;
        }
        else if (nvMatch.Groups[4].Success)
        {
            value = nvMatch.Groups[4].Value;
        }

        elementObject.Properties.Add(new PSNoteProperty(name, value));
    }
}
#endregion Methods
}
}