Reduce allocations in Get-Content cmdlet (#8103)
* Simplify main loop in ReadDelimited() by bringing out raw stream code * Remove unused constructor * Allocate buffer for current line once in constructor * Use Span buffer to significantly reduce allocations * Optimize Boyer-Moore. Use simple mapping Unicode chars to byte. It works well for common text files with chars from one-two languages.
This commit is contained in:
parent
ddaad12183
commit
0e19042848
|
@ -16,6 +16,7 @@
|
|||
<PackageReference Include="System.DirectoryServices" Version="4.5.0" />
|
||||
<PackageReference Include="System.IO.FileSystem.AccessControl" Version="4.5.0" />
|
||||
<PackageReference Include="System.Management" Version="4.5.0" />
|
||||
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.5.2" />
|
||||
<PackageReference Include="System.Security.AccessControl" Version="4.5.0" />
|
||||
<PackageReference Include="System.Security.Cryptography.Pkcs" Version="4.5.1" />
|
||||
<PackageReference Include="System.Security.Permissions" Version="4.5.0" />
|
||||
|
|
|
@ -6,11 +6,12 @@ using System.Collections;
|
|||
using System.Collections.Generic;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.IO;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using System.Management.Automation;
|
||||
using System.Management.Automation.Internal;
|
||||
using System.Management.Automation.Provider;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Dbg = System.Management.Automation;
|
||||
|
||||
|
@ -54,8 +55,11 @@ namespace Microsoft.PowerShell.Commands
|
|||
private StreamReader _reader;
|
||||
private StreamWriter _writer;
|
||||
private bool _usingByteEncoding;
|
||||
private string _delimiter = "\n";
|
||||
private const char DefaultDelimiter = '\n';
|
||||
private string _delimiter = $"{DefaultDelimiter}";
|
||||
private int[] _offsetDictionary;
|
||||
private bool _usingDelimiter;
|
||||
private StringBuilder _currentLineContent;
|
||||
private bool _waitForChanges;
|
||||
private bool _isRawStream;
|
||||
private long _fileOffset;
|
||||
|
@ -222,50 +226,6 @@ namespace Microsoft.PowerShell.Commands
|
|||
_suppressNewline = suppressNewline;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor for the content stream
|
||||
/// </summary>
|
||||
/// <param name="path">
|
||||
/// The path to the file to get the content from.
|
||||
/// </param>
|
||||
/// <param name="mode">
|
||||
/// The file mode to open the file with.
|
||||
/// </param>
|
||||
/// <param name="access">
|
||||
/// The file access requested in the file.
|
||||
/// </param>
|
||||
/// <param name="share">
|
||||
/// The file share to open the file with
|
||||
/// </param>
|
||||
/// <param name="delimiter">
|
||||
/// The delimiter to use when reading strings. Each time read is called, all contents up to an including
|
||||
/// the delimiter is read.
|
||||
/// </param>
|
||||
/// <param name="encoding">
|
||||
/// The encoding of the file to be read or written.
|
||||
/// </param>
|
||||
/// <param name="waitForChanges">
|
||||
/// If true, we will perform blocking reads on the file, waiting for new content to be appended
|
||||
/// </param>
|
||||
/// <param name="provider">
|
||||
/// The CmdletProvider invoking this stream
|
||||
/// </param>
|
||||
/// <param name="isRawStream">
|
||||
/// Indicates raw stream.
|
||||
/// </param>
|
||||
public FileSystemContentReaderWriter(
|
||||
string path,
|
||||
FileMode mode,
|
||||
FileAccess access,
|
||||
FileShare share,
|
||||
string delimiter,
|
||||
Encoding encoding,
|
||||
bool waitForChanges,
|
||||
CmdletProvider provider,
|
||||
bool isRawStream) : this(path, null, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor for the content stream
|
||||
/// </summary>
|
||||
|
@ -314,8 +274,50 @@ namespace Microsoft.PowerShell.Commands
|
|||
bool isRawStream)
|
||||
: this(path, streamName, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
|
||||
{
|
||||
_delimiter = delimiter;
|
||||
_usingDelimiter = true;
|
||||
// If the delimiter is default ('\n') we'll use ReadLine() method.
|
||||
// Otherwise allocate temporary structures for ReadDelimited() method.
|
||||
if (!(delimiter.Length == 1 && delimiter[0] == DefaultDelimiter))
|
||||
{
|
||||
_delimiter = delimiter;
|
||||
_usingDelimiter = true;
|
||||
|
||||
// We expect that we are parsing files where line lengths can be relatively long.
|
||||
const int DefaultLineLength = 256;
|
||||
_currentLineContent = new StringBuilder(DefaultLineLength);
|
||||
|
||||
// For Boyer-Moore string search algorithm.
|
||||
// Populate the offset lookups.
|
||||
// These will tell us the maximum number of characters
|
||||
// we can read to generate another possible match (safe shift).
|
||||
// If we read more characters than this, we risk consuming
|
||||
// more of the stream than we need.
|
||||
//
|
||||
// Because an unicode character size is 2 byte we would to have use
|
||||
// very large array with 65535 size to keep this safe offsets.
|
||||
// One solution is to pack unicode character to byte.
|
||||
// The workaround is to use low byte from unicode character.
|
||||
// This allow us to use small array with size 256.
|
||||
// This workaround is the fastest and provides excellent results
|
||||
// in regular search scenarios when the file contains
|
||||
// mostly characters from the same alphabet.
|
||||
_offsetDictionary = new int[256];
|
||||
|
||||
// If next char from file is not in search pattern safe shift is the search pattern length.
|
||||
for (var n = 0; n < _offsetDictionary.Length; n++)
|
||||
{
|
||||
_offsetDictionary[n] = _delimiter.Length;
|
||||
}
|
||||
|
||||
// If next char from file is in search pattern we should calculate a safe shift.
|
||||
char currentChar;
|
||||
byte lowByte;
|
||||
for (var i = 0; i < _delimiter.Length; i++)
|
||||
{
|
||||
currentChar = _delimiter[i];
|
||||
lowByte = Unsafe.As<char, byte>(ref currentChar);
|
||||
_offsetDictionary[lowByte] = _delimiter.Length - i - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -578,103 +580,111 @@ namespace Microsoft.PowerShell.Commands
|
|||
|
||||
private bool ReadDelimited(bool waitChanges, ArrayList blocks, bool readBackward, string actualDelimiter)
|
||||
{
|
||||
if (_isRawStream)
|
||||
{
|
||||
// when -Raw is used we want to anyway read the whole thing
|
||||
// so avoiding the while loop by reading the entire content.
|
||||
string contentRead = _reader.ReadToEnd();
|
||||
if (contentRead.Length > 0)
|
||||
{
|
||||
blocks.Add(contentRead);
|
||||
}
|
||||
|
||||
// We already read whole file so return EOF.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Since the delimiter is a string, we're essentially
|
||||
// dealing with a "find the substring" algorithm, but with
|
||||
// the additional restriction that we cannot read past the
|
||||
// end of the delimiter. If we read past the end of the delimiter,
|
||||
// end of the delimiter. If we read past the end of the delimiter,
|
||||
// then we'll eat up bytes that we need from the filestream.
|
||||
// The solution is a modified Boyer-Moore string search algorithm.
|
||||
// This version retains the sub-linear search performance (via the
|
||||
// lookup tables,) but offloads much of the dirty work to the
|
||||
// very efficient BCL String.IndexOf(, StringComparison.Ordinal) method.
|
||||
// lookup tables).
|
||||
int numRead = 0;
|
||||
int currentOffset = actualDelimiter.Length;
|
||||
StringBuilder content = new StringBuilder();
|
||||
|
||||
// Populate the offset lookups
|
||||
// These will tell us the maximum number of characters
|
||||
// we can read to generate another possible match.
|
||||
// If we read more characters than this, we risk consuming
|
||||
// more of the stream than we need.
|
||||
Dictionary<char, int> offsetDictionary = new Dictionary<char, int>();
|
||||
foreach (char currentChar in actualDelimiter)
|
||||
offsetDictionary[currentChar] = actualDelimiter.Length - actualDelimiter.LastIndexOf(currentChar) - 1;
|
||||
Span<char> readBuffer = stackalloc char[currentOffset];
|
||||
bool delimiterNotFound = true;
|
||||
_currentLineContent.Clear();
|
||||
|
||||
do
|
||||
{
|
||||
if (_isRawStream)
|
||||
{
|
||||
// when -Raw is used we want to anyway read the whole thing
|
||||
// so avoiding the while loop by reading the entire content.
|
||||
string contentRead = _reader.ReadToEnd();
|
||||
numRead = contentRead.Length;
|
||||
content.Append(contentRead);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Read in the required batch of characters
|
||||
var readBuffer = new char[currentOffset];
|
||||
numRead = readBackward
|
||||
? _backReader.Read(readBuffer, 0, currentOffset)
|
||||
: _reader.Read(readBuffer, 0, currentOffset);
|
||||
// Read in the required batch of characters
|
||||
numRead = readBackward
|
||||
? _backReader.Read(readBuffer.Slice(0, currentOffset))
|
||||
: _reader.Read(readBuffer.Slice(0, currentOffset));
|
||||
|
||||
// If we want to wait for changes, then we'll keep on attempting to read
|
||||
// until we fill the buffer.
|
||||
if (numRead == 0)
|
||||
// If we want to wait for changes, then we'll keep on attempting to read
|
||||
// until we fill the buffer.
|
||||
if (numRead == 0)
|
||||
{
|
||||
if (waitChanges)
|
||||
{
|
||||
if (waitChanges)
|
||||
// But stop reading if the provider is stopping
|
||||
while ((numRead < currentOffset) && (!_provider.Stopping))
|
||||
{
|
||||
// But stop reading if the provider is stopping
|
||||
while ((numRead < currentOffset) && (!_provider.Stopping))
|
||||
// Get the change, and try to read more characters
|
||||
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
|
||||
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
|
||||
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
|
||||
numRead += _reader.Read(readBuffer.Slice(0, (currentOffset - numRead)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numRead > 0)
|
||||
{
|
||||
_currentLineContent.Append(readBuffer.Slice(0, numRead));
|
||||
|
||||
// Look up the final character in our offset table.
|
||||
// If the character doesn't exist in the lookup table, then it's not in
|
||||
// our search key. That means the match must happen strictly /after/ the
|
||||
// current position. Because of that, we can feel confident reading in the
|
||||
// number of characters in the search key, without the risk of reading too many.
|
||||
var currentChar = _currentLineContent[_currentLineContent.Length - 1];
|
||||
currentOffset = _offsetDictionary[Unsafe.As<char, byte>(ref currentChar)];
|
||||
|
||||
// We want to keep reading if delimiter not found and we haven't hit the end of file
|
||||
delimiterNotFound = true;
|
||||
|
||||
// If the final letters matched, then we will get an offset of "0".
|
||||
// In that case, we'll either have a match (and break from the while loop,)
|
||||
// or we need to move the scan forward one position.
|
||||
if (currentOffset == 0)
|
||||
{
|
||||
currentOffset = 1;
|
||||
|
||||
if (actualDelimiter.Length <= _currentLineContent.Length)
|
||||
{
|
||||
delimiterNotFound = false;
|
||||
int i = 0;
|
||||
int j = _currentLineContent.Length - actualDelimiter.Length;
|
||||
for (; i < actualDelimiter.Length; i++, j++)
|
||||
{
|
||||
// Get the change, and try to read more characters
|
||||
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
|
||||
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
|
||||
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
|
||||
numRead += _reader.Read(readBuffer, 0, (currentOffset - numRead));
|
||||
if (actualDelimiter[i] != _currentLineContent[j])
|
||||
{
|
||||
delimiterNotFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numRead > 0)
|
||||
{
|
||||
content.Append(readBuffer, 0, numRead);
|
||||
|
||||
// Look up the final character in our offset table.
|
||||
// If the character doesn't exist in the lookup table, then it's not in
|
||||
// our search key. That means the match must happen strictly /after/ the
|
||||
// current position. Because of that, we can feel confident reading in the
|
||||
// number of characters in the search key, without the risk of reading too many.
|
||||
if (!offsetDictionary.TryGetValue(content[content.Length - 1], out currentOffset))
|
||||
currentOffset = actualDelimiter.Length;
|
||||
|
||||
// If the final letters matched, then we will get an offset of "0".
|
||||
// In that case, we'll either have a match (and break from the while loop,)
|
||||
// or we need to move the scan forward one position.
|
||||
if (currentOffset == 0)
|
||||
currentOffset = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Two cases where we want to keep reading:
|
||||
// 1. Raw stream and we haven't hit the end of file
|
||||
// 2. Delimiter not found and we haven't hit the end of file
|
||||
} while ((_isRawStream && (numRead != 0)) ||
|
||||
((content.ToString().IndexOf(actualDelimiter, StringComparison.Ordinal) < 0) && (numRead != 0)));
|
||||
} while (delimiterNotFound && (numRead != 0));
|
||||
|
||||
// We've reached the end of file or end of line.
|
||||
if (content.Length > 0)
|
||||
if (_currentLineContent.Length > 0)
|
||||
{
|
||||
// Add the block read to the ouptut array list, trimming a trailing delimiter, if present.
|
||||
// Note: If -Tail was specified, we get here in the course of 2 distinct passes:
|
||||
// - Once while reading backward simply to determine the appropriate *start position* for later forward reading, ignoring the content of the blocks read (in reverse).
|
||||
// - Then again during forward reading, for regular output processing; it is only then that trimming the delimiter is necessary.
|
||||
// (Trimming it during backward reading would not only be unnecessary, but could interfere with determining the correct start position.)
|
||||
string contentString = content.ToString();
|
||||
blocks.Add(
|
||||
!readBackward && contentString.EndsWith(actualDelimiter, StringComparison.Ordinal) && !_isRawStream
|
||||
? contentString.Substring(0, content.Length - actualDelimiter.Length)
|
||||
: contentString
|
||||
!readBackward && !delimiterNotFound
|
||||
? _currentLineContent.ToString(0, _currentLineContent.Length - actualDelimiter.Length)
|
||||
: _currentLineContent.ToString()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -683,7 +693,7 @@ namespace Microsoft.PowerShell.Commands
|
|||
return true;
|
||||
else
|
||||
{
|
||||
if (readBackward && content.Length > 0)
|
||||
if (readBackward && _currentLineContent.Length > 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
@ -1259,14 +1269,32 @@ namespace Microsoft.PowerShell.Commands
|
|||
/// <summary>
|
||||
/// Read a specific maximum of characters from the current stream into a buffer
|
||||
/// </summary>
|
||||
/// <param name="buffer"></param>
|
||||
/// <param name="index"></param>
|
||||
/// <param name="count"></param>
|
||||
/// <param name="buffer">Output buffer.</param>
|
||||
/// <param name="index">Start position to write with.</param>
|
||||
/// <param name="count">Number of bytes to read.</param>
|
||||
/// <returns>Return the number of characters read, or -1 if we reach the head of the file</returns>
|
||||
/// <returns>Return the number of characters read, or -1 if we reach the head of the file</returns>
|
||||
public override int Read(char[] buffer, int index, int count)
|
||||
{
|
||||
return ReadSpan(new Span<char>(buffer, index, count));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Read characters from the current stream into a Span buffer.
|
||||
/// </summary>
|
||||
/// <param name="buffer">Output buffer.</param>
|
||||
/// <returns>Return the number of characters read, or -1 if we reach the head of the file.</returns>
|
||||
public override int Read(Span<char> buffer)
|
||||
{
|
||||
return ReadSpan(buffer);
|
||||
}
|
||||
|
||||
private int ReadSpan(Span<char> buffer)
|
||||
{
|
||||
// deal with the argument validation
|
||||
int charRead = 0;
|
||||
int index = 0;
|
||||
int count = buffer.Length;
|
||||
|
||||
do
|
||||
{
|
||||
|
@ -1284,7 +1312,8 @@ namespace Microsoft.PowerShell.Commands
|
|||
{
|
||||
buffer[index++] = _charBuff[--_charCount];
|
||||
}
|
||||
} while (count > 0);
|
||||
}
|
||||
while (count > 0);
|
||||
|
||||
return charRead;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue