Reduce allocations in Get-Content cmdlet (#8103)

* Simplify the main loop in ReadDelimited() by moving the raw-stream code out of it
* Remove unused constructor
* Allocate the buffer for the current line once, in the constructor
* Use a Span buffer to significantly reduce allocations
* Optimize Boyer-Moore: use a simple mapping from Unicode chars to bytes (a standalone sketch follows below). This works well for common text files whose characters come from one or two languages.
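
As a rough illustration of that byte-mapped shift table, here is a minimal standalone sketch; the type and method names are invented for this example, and a plain (byte) cast stands in for the Unsafe.As<char, byte> call used in the actual change (both pick up the low byte on little-endian platforms):

using System;

internal static class DelimiterShiftTableSketch
{
    // Build the Boyer-Moore "safe shift" table, keyed by the low byte of each delimiter char.
    private static int[] BuildShiftTable(string delimiter)
    {
        var shifts = new int[256];

        // A char that does not occur in the delimiter allows the maximum safe shift.
        for (int i = 0; i < shifts.Length; i++)
        {
            shifts[i] = delimiter.Length;
        }

        // Chars that do occur get a smaller shift, based on their last position in the delimiter.
        for (int i = 0; i < delimiter.Length; i++)
        {
            shifts[(byte)delimiter[i]] = delimiter.Length - i - 1;
        }

        return shifts;
    }

    private static void Main()
    {
        int[] shifts = BuildShiftTable("END");

        // 'D' is the final delimiter char, so its shift is 0 (a match is possible at this position);
        // 'x' never occurs in the delimiter, so another 3 chars can be read safely.
        Console.WriteLine(shifts[(byte)'D']); // 0
        Console.WriteLine(shifts[(byte)'x']); // 3
    }
}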
Ilya 2018-10-26 08:55:54 +05:00 committed by GitHub
parent ddaad12183
commit 0e19042848
2 changed files with 153 additions and 123 deletions


@@ -16,6 +16,7 @@
<PackageReference Include="System.DirectoryServices" Version="4.5.0" />
<PackageReference Include="System.IO.FileSystem.AccessControl" Version="4.5.0" />
<PackageReference Include="System.Management" Version="4.5.0" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.5.2" />
<PackageReference Include="System.Security.AccessControl" Version="4.5.0" />
<PackageReference Include="System.Security.Cryptography.Pkcs" Version="4.5.1" />
<PackageReference Include="System.Security.Permissions" Version="4.5.0" />


@@ -6,11 +6,12 @@ using System.Collections;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Management.Automation;
using System.Management.Automation.Internal;
using System.Management.Automation.Provider;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;
using Dbg = System.Management.Automation;
@@ -54,8 +55,11 @@ namespace Microsoft.PowerShell.Commands
private StreamReader _reader;
private StreamWriter _writer;
private bool _usingByteEncoding;
private string _delimiter = "\n";
private const char DefaultDelimiter = '\n';
private string _delimiter = $"{DefaultDelimiter}";
private int[] _offsetDictionary;
private bool _usingDelimiter;
private StringBuilder _currentLineContent;
private bool _waitForChanges;
private bool _isRawStream;
private long _fileOffset;
@@ -222,50 +226,6 @@ namespace Microsoft.PowerShell.Commands
_suppressNewline = suppressNewline;
}
/// <summary>
/// Constructor for the content stream
/// </summary>
/// <param name="path">
/// The path to the file to get the content from.
/// </param>
/// <param name="mode">
/// The file mode to open the file with.
/// </param>
/// <param name="access">
/// The file access requested in the file.
/// </param>
/// <param name="share">
/// The file share to open the file with.
/// </param>
/// <param name="delimiter">
/// The delimiter to use when reading strings. Each time read is called, all content up to and including
/// the delimiter is read.
/// </param>
/// <param name="encoding">
/// The encoding of the file to be read or written.
/// </param>
/// <param name="waitForChanges">
/// If true, we will perform blocking reads on the file, waiting for new content to be appended.
/// </param>
/// <param name="provider">
/// The CmdletProvider invoking this stream
/// </param>
/// <param name="isRawStream">
/// Indicates raw stream.
/// </param>
public FileSystemContentReaderWriter(
string path,
FileMode mode,
FileAccess access,
FileShare share,
string delimiter,
Encoding encoding,
bool waitForChanges,
CmdletProvider provider,
bool isRawStream) : this(path, null, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
{
}
/// <summary>
/// Constructor for the content stream
/// </summary>
@@ -314,8 +274,50 @@ namespace Microsoft.PowerShell.Commands
bool isRawStream)
: this(path, streamName, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
{
_delimiter = delimiter;
_usingDelimiter = true;
// If the delimiter is the default ('\n'), we'll use the ReadLine() method.
// Otherwise, allocate temporary structures for the ReadDelimited() method.
if (!(delimiter.Length == 1 && delimiter[0] == DefaultDelimiter))
{
_delimiter = delimiter;
_usingDelimiter = true;
// We expect that we are parsing files where line lengths can be relatively long.
const int DefaultLineLength = 256;
_currentLineContent = new StringBuilder(DefaultLineLength);
// For Boyer-Moore string search algorithm.
// Populate the offset lookups.
// These will tell us the maximum number of characters
// we can read to generate another possible match (safe shift).
// If we read more characters than this, we risk consuming
// more of the stream than we need.
//
// Because a Unicode character is 2 bytes wide, we would have to use
// a very large array of 65536 entries to hold these safe offsets.
// One solution is to pack the Unicode character into a byte.
// The workaround is to use the low byte of the Unicode character,
// which allows us to use a small array of only 256 entries.
// This workaround is the fastest and provides excellent results
// in typical search scenarios, where the file contains
// mostly characters from the same alphabet.
_offsetDictionary = new int[256];
// If the next char from the file is not in the search pattern, the safe shift is the search pattern length.
for (var n = 0; n < _offsetDictionary.Length; n++)
{
_offsetDictionary[n] = _delimiter.Length;
}
// If the next char from the file is in the search pattern, we calculate the exact safe shift.
char currentChar;
byte lowByte;
for (var i = 0; i < _delimiter.Length; i++)
{
currentChar = _delimiter[i];
lowByte = Unsafe.As<char, byte>(ref currentChar);
_offsetDictionary[lowByte] = _delimiter.Length - i - 1;
}
}
}
/// <summary>
@@ -578,103 +580,111 @@ namespace Microsoft.PowerShell.Commands
private bool ReadDelimited(bool waitChanges, ArrayList blocks, bool readBackward, string actualDelimiter)
{
if (_isRawStream)
{
// When -Raw is used we want to read the whole thing anyway,
// so avoid the while loop by reading the entire content at once.
string contentRead = _reader.ReadToEnd();
if (contentRead.Length > 0)
{
blocks.Add(contentRead);
}
// We have already read the whole file, so return EOF.
return false;
}
// Since the delimiter is a string, we're essentially
// dealing with a "find the substring" algorithm, but with
// the additional restriction that we cannot read past the
// end of the delimiter. If we read past the end of the delimiter,
// then we'll eat up bytes that we need from the filestream.
// The solution is a modified Boyer-Moore string search algorithm.
// This version retains the sub-linear search performance (via the
// lookup tables,) but offloads much of the dirty work to the
// very efficient BCL String.IndexOf(, StringComparison.Ordinal) method.
// lookup tables).
int numRead = 0;
int currentOffset = actualDelimiter.Length;
StringBuilder content = new StringBuilder();
// Populate the offset lookups
// These will tell us the maximum number of characters
// we can read to generate another possible match.
// If we read more characters than this, we risk consuming
// more of the stream than we need.
Dictionary<char, int> offsetDictionary = new Dictionary<char, int>();
foreach (char currentChar in actualDelimiter)
offsetDictionary[currentChar] = actualDelimiter.Length - actualDelimiter.LastIndexOf(currentChar) - 1;
Span<char> readBuffer = stackalloc char[currentOffset];
bool delimiterNotFound = true;
_currentLineContent.Clear();
do
{
if (_isRawStream)
{
// when -Raw is used we want to anyway read the whole thing
// so avoiding the while loop by reading the entire content.
string contentRead = _reader.ReadToEnd();
numRead = contentRead.Length;
content.Append(contentRead);
}
else
{
// Read in the required batch of characters
var readBuffer = new char[currentOffset];
numRead = readBackward
? _backReader.Read(readBuffer, 0, currentOffset)
: _reader.Read(readBuffer, 0, currentOffset);
// Read in the required batch of characters
numRead = readBackward
? _backReader.Read(readBuffer.Slice(0, currentOffset))
: _reader.Read(readBuffer.Slice(0, currentOffset));
// If we want to wait for changes, then we'll keep on attempting to read
// until we fill the buffer.
if (numRead == 0)
// If we want to wait for changes, then we'll keep on attempting to read
// until we fill the buffer.
if (numRead == 0)
{
if (waitChanges)
{
if (waitChanges)
// But stop reading if the provider is stopping
while ((numRead < currentOffset) && (!_provider.Stopping))
{
// But stop reading if the provider is stopping
while ((numRead < currentOffset) && (!_provider.Stopping))
// Get the change, and try to read more characters
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
numRead += _reader.Read(readBuffer.Slice(0, (currentOffset - numRead)));
}
}
}
if (numRead > 0)
{
_currentLineContent.Append(readBuffer.Slice(0, numRead));
// Look up the final character in our offset table.
// If the character doesn't exist in the lookup table, then it's not in
// our search key. That means the match must happen strictly /after/ the
// current position. Because of that, we can feel confident reading in the
// number of characters in the search key, without the risk of reading too many.
var currentChar = _currentLineContent[_currentLineContent.Length - 1];
currentOffset = _offsetDictionary[Unsafe.As<char, byte>(ref currentChar)];
// We want to keep reading if delimiter not found and we haven't hit the end of file
delimiterNotFound = true;
// If the final letters matched, then we will get an offset of "0".
// In that case, we'll either have a match (and break from the while loop,)
// or we need to move the scan forward one position.
if (currentOffset == 0)
{
currentOffset = 1;
if (actualDelimiter.Length <= _currentLineContent.Length)
{
delimiterNotFound = false;
int i = 0;
int j = _currentLineContent.Length - actualDelimiter.Length;
for (; i < actualDelimiter.Length; i++, j++)
{
// Get the change, and try to read more characters
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
numRead += _reader.Read(readBuffer, 0, (currentOffset - numRead));
if (actualDelimiter[i] != _currentLineContent[j])
{
delimiterNotFound = true;
break;
}
}
}
}
if (numRead > 0)
{
content.Append(readBuffer, 0, numRead);
// Look up the final character in our offset table.
// If the character doesn't exist in the lookup table, then it's not in
// our search key. That means the match must happen strictly /after/ the
// current position. Because of that, we can feel confident reading in the
// number of characters in the search key, without the risk of reading too many.
if (!offsetDictionary.TryGetValue(content[content.Length - 1], out currentOffset))
currentOffset = actualDelimiter.Length;
// If the final letters matched, then we will get an offset of "0".
// In that case, we'll either have a match (and break from the while loop,)
// or we need to move the scan forward one position.
if (currentOffset == 0)
currentOffset = 1;
}
}
// Two cases where we want to keep reading:
// 1. Raw stream and we haven't hit the end of file
// 2. Delimiter not found and we haven't hit the end of file
} while ((_isRawStream && (numRead != 0)) ||
((content.ToString().IndexOf(actualDelimiter, StringComparison.Ordinal) < 0) && (numRead != 0)));
} while (delimiterNotFound && (numRead != 0));
// We've reached the end of file or end of line.
if (content.Length > 0)
if (_currentLineContent.Length > 0)
{
// Add the block read to the output array list, trimming a trailing delimiter, if present.
// Note: If -Tail was specified, we get here in the course of 2 distinct passes:
// - Once while reading backward simply to determine the appropriate *start position* for later forward reading, ignoring the content of the blocks read (in reverse).
// - Then again during forward reading, for regular output processing; it is only then that trimming the delimiter is necessary.
// (Trimming it during backward reading would not only be unnecessary, but could interfere with determining the correct start position.)
string contentString = content.ToString();
blocks.Add(
!readBackward && contentString.EndsWith(actualDelimiter, StringComparison.Ordinal) && !_isRawStream
? contentString.Substring(0, content.Length - actualDelimiter.Length)
: contentString
!readBackward && !delimiterNotFound
? _currentLineContent.ToString(0, _currentLineContent.Length - actualDelimiter.Length)
: _currentLineContent.ToString()
);
}
@@ -683,7 +693,7 @@ namespace Microsoft.PowerShell.Commands
return true;
else
{
if (readBackward && content.Length > 0)
if (readBackward && _currentLineContent.Length > 0)
{
return true;
}
@@ -1259,14 +1269,32 @@ namespace Microsoft.PowerShell.Commands
/// <summary>
/// Read a specific maximum of characters from the current stream into a buffer
/// </summary>
/// <param name="buffer"></param>
/// <param name="index"></param>
/// <param name="count"></param>
/// <param name="buffer">Output buffer.</param>
/// <param name="index">Start position to write with.</param>
/// <param name="count">Number of bytes to read.</param>
/// <returns>Return the number of characters read, or -1 if we reach the head of the file</returns>
public override int Read(char[] buffer, int index, int count)
{
return ReadSpan(new Span<char>(buffer, index, count));
}
/// <summary>
/// Read characters from the current stream into a Span buffer.
/// </summary>
/// <param name="buffer">Output buffer.</param>
/// <returns>Return the number of characters read, or -1 if we reach the head of the file.</returns>
public override int Read(Span<char> buffer)
{
return ReadSpan(buffer);
}
private int ReadSpan(Span<char> buffer)
{
// deal with the argument validation
int charRead = 0;
int index = 0;
int count = buffer.Length;
do
{
@@ -1284,7 +1312,8 @@ namespace Microsoft.PowerShell.Commands
{
buffer[index++] = _charBuff[--_charCount];
}
} while (count > 0);
}
while (count > 0);
return charRead;
}