Reduce allocations in Get-Content cmdlet (#8103)

* Simplify the main loop in ReadDelimited() by moving the raw-stream code out of it
* Remove unused constructor
* Allocate the buffer for the current line once, in the constructor
* Use a Span buffer to significantly reduce allocations
* Optimize Boyer-Moore: use a simple mapping from Unicode chars to bytes (a standalone sketch follows below). This works well for common text files whose characters come from one or two languages.
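
As a rough illustration of that byte-mapped shift table, here is a minimal standalone sketch; the type and method names are invented for this example, and a plain (byte) cast stands in for the Unsafe.As<char, byte> call used in the actual change (both pick up the low byte on little-endian platforms):

using System;

internal static class DelimiterShiftTableSketch
{
    // Build the Boyer-Moore "safe shift" table, keyed by the low byte of each delimiter char.
    private static int[] BuildShiftTable(string delimiter)
    {
        var shifts = new int[256];

        // A char that does not occur in the delimiter allows the maximum safe shift.
        for (int i = 0; i < shifts.Length; i++)
        {
            shifts[i] = delimiter.Length;
        }

        // Chars that do occur get a smaller shift, based on their last position in the delimiter.
        for (int i = 0; i < delimiter.Length; i++)
        {
            shifts[(byte)delimiter[i]] = delimiter.Length - i - 1;
        }

        return shifts;
    }

    private static void Main()
    {
        int[] shifts = BuildShiftTable("END");

        // 'D' is the final delimiter char, so its shift is 0 (a match is possible at this position);
        // 'x' never occurs in the delimiter, so another 3 chars can be read safely.
        Console.WriteLine(shifts[(byte)'D']); // 0
        Console.WriteLine(shifts[(byte)'x']); // 3
    }
}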
Ilya 2018-10-26 08:55:54 +05:00 committed by GitHub
parent ddaad12183
commit 0e19042848
2 changed files with 153 additions and 123 deletions


@@ -16,6 +16,7 @@
<PackageReference Include="System.DirectoryServices" Version="4.5.0" />
<PackageReference Include="System.IO.FileSystem.AccessControl" Version="4.5.0" />
<PackageReference Include="System.Management" Version="4.5.0" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.5.2" />
<PackageReference Include="System.Security.AccessControl" Version="4.5.0" />
<PackageReference Include="System.Security.Cryptography.Pkcs" Version="4.5.1" />
<PackageReference Include="System.Security.Permissions" Version="4.5.0" />


@@ -6,11 +6,12 @@ using System.Collections;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using System.Management.Automation;
using System.Management.Automation.Internal;
using System.Management.Automation.Provider;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;
using Dbg = System.Management.Automation;
@@ -54,8 +55,11 @@ namespace Microsoft.PowerShell.Commands
private StreamReader _reader;
private StreamWriter _writer;
private bool _usingByteEncoding;
private string _delimiter = "\n";
private const char DefaultDelimiter = '\n';
private string _delimiter = $"{DefaultDelimiter}";
private int[] _offsetDictionary;
private bool _usingDelimiter;
private StringBuilder _currentLineContent;
private bool _waitForChanges;
private bool _isRawStream;
private long _fileOffset;
@@ -222,50 +226,6 @@ namespace Microsoft.PowerShell.Commands
_suppressNewline = suppressNewline;
}
/// <summary>
/// Constructor for the content stream
/// </summary>
/// <param name="path">
/// The path to the file to get the content from.
/// </param>
/// <param name="mode">
/// The file mode to open the file with.
/// </param>
/// <param name="access">
/// The file access requested in the file.
/// </param>
/// <param name="share">
/// The file share to open the file with.
/// </param>
/// <param name="delimiter">
/// The delimiter to use when reading strings. Each time read is called, all content up to and including
/// the delimiter is read.
/// </param>
/// <param name="encoding">
/// The encoding of the file to be read or written.
/// </param>
/// <param name="waitForChanges">
/// If true, we will perform blocking reads on the file, waiting for new content to be appended.
/// </param>
/// <param name="provider">
/// The CmdletProvider invoking this stream
/// </param>
/// <param name="isRawStream">
/// Indicates raw stream.
/// </param>
public FileSystemContentReaderWriter(
string path,
FileMode mode,
FileAccess access,
FileShare share,
string delimiter,
Encoding encoding,
bool waitForChanges,
CmdletProvider provider,
bool isRawStream) : this(path, null, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
{
}
/// <summary>
/// Constructor for the content stream
/// </summary>
@@ -314,8 +274,50 @@ namespace Microsoft.PowerShell.Commands
bool isRawStream)
: this(path, streamName, mode, access, share, encoding, false, waitForChanges, provider, isRawStream)
{
_delimiter = delimiter;
_usingDelimiter = true;
// If the delimiter is the default ('\n'), we'll use the ReadLine() method.
// Otherwise, allocate temporary structures for the ReadDelimited() method.
if (!(delimiter.Length == 1 && delimiter[0] == DefaultDelimiter))
{
_delimiter = delimiter;
_usingDelimiter = true;
// We expect that we are parsing files where line lengths can be relatively long.
const int DefaultLineLength = 256;
_currentLineContent = new StringBuilder(DefaultLineLength);
// For Boyer-Moore string search algorithm.
// Populate the offset lookups.
// These will tell us the maximum number of characters
// we can read to generate another possible match (safe shift).
// If we read more characters than this, we risk consuming
// more of the stream than we need.
//
// Because a Unicode character is 2 bytes wide, we would have to use
// a very large array of 65536 entries to hold these safe offsets.
// One solution is to pack the Unicode character into a byte.
// The workaround is to use the low byte of the Unicode character,
// which allows us to use a small array of only 256 entries.
// This workaround is the fastest and provides excellent results
// in typical search scenarios, where the file contains
// mostly characters from the same alphabet.
_offsetDictionary = new int[256];
// If the next char from the file is not in the search pattern, the safe shift is the search pattern length.
for (var n = 0; n < _offsetDictionary.Length; n++)
{
_offsetDictionary[n] = _delimiter.Length;
}
// If the next char from the file is in the search pattern, we calculate the exact safe shift.
char currentChar;
byte lowByte;
for (var i = 0; i < _delimiter.Length; i++)
{
currentChar = _delimiter[i];
lowByte = Unsafe.As<char, byte>(ref currentChar);
_offsetDictionary[lowByte] = _delimiter.Length - i - 1;
}
}
}
/// <summary>
@@ -578,103 +580,111 @@ namespace Microsoft.PowerShell.Commands
private bool ReadDelimited(bool waitChanges, ArrayList blocks, bool readBackward, string actualDelimiter)
{
if (_isRawStream)
{
// When -Raw is used we want to read the whole thing anyway,
// so avoid the while loop by reading the entire content at once.
string contentRead = _reader.ReadToEnd();
if (contentRead.Length > 0)
{
blocks.Add(contentRead);
}
// We have already read the whole file, so return EOF.
return false;
}
// Since the delimiter is a string, we're essentially
// dealing with a "find the substring" algorithm, but with
// the additional restriction that we cannot read past the
// end of the delimiter. If we read past the end of the delimiter,
// then we'll eat up bytes that we need from the filestream.
// The solution is a modified Boyer-Moore string search algorithm.
// This version retains the sub-linear search performance (via the
// lookup tables,) but offloads much of the dirty work to the
// very efficient BCL String.IndexOf(, StringComparison.Ordinal) method.
// lookup tables).
int numRead = 0;
int currentOffset = actualDelimiter.Length;
StringBuilder content = new StringBuilder();
// Populate the offset lookups
// These will tell us the maximum number of characters
// we can read to generate another possible match.
// If we read more characters than this, we risk consuming
// more of the stream than we need.
Dictionary<char, int> offsetDictionary = new Dictionary<char, int>();
foreach (char currentChar in actualDelimiter)
offsetDictionary[currentChar] = actualDelimiter.Length - actualDelimiter.LastIndexOf(currentChar) - 1;
Span<char> readBuffer = stackalloc char[currentOffset];
bool delimiterNotFound = true;
_currentLineContent.Clear();
do
{
if (_isRawStream)
{
// when -Raw is used we want to anyway read the whole thing
// so avoiding the while loop by reading the entire content.
string contentRead = _reader.ReadToEnd();
numRead = contentRead.Length;
content.Append(contentRead);
}
else
{
// Read in the required batch of characters
var readBuffer = new char[currentOffset];
numRead = readBackward
? _backReader.Read(readBuffer, 0, currentOffset)
: _reader.Read(readBuffer, 0, currentOffset);
// Read in the required batch of characters
numRead = readBackward
? _backReader.Read(readBuffer.Slice(0, currentOffset))
: _reader.Read(readBuffer.Slice(0, currentOffset));
// If we want to wait for changes, then we'll keep on attempting to read
// until we fill the buffer.
if (numRead == 0)
// If we want to wait for changes, then we'll keep on attempting to read
// until we fill the buffer.
if (numRead == 0)
{
if (waitChanges)
{
if (waitChanges)
// But stop reading if the provider is stopping
while ((numRead < currentOffset) && (!_provider.Stopping))
{
// But stop reading if the provider is stopping
while ((numRead < currentOffset) && (!_provider.Stopping))
// Get the change, and try to read more characters
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
numRead += _reader.Read(readBuffer.Slice(0, (currentOffset - numRead)));
}
}
}
if (numRead > 0)
{
_currentLineContent.Append(readBuffer.Slice(0, numRead));
// Look up the final character in our offset table.
// If the character doesn't exist in the lookup table, then it's not in
// our search key. That means the match must happen strictly /after/ the
// current position. Because of that, we can feel confident reading in the
// number of characters in the search key, without the risk of reading too many.
var currentChar = _currentLineContent[_currentLineContent.Length - 1];
currentOffset = _offsetDictionary[Unsafe.As<char, byte>(ref currentChar)];
// We want to keep reading if delimiter not found and we haven't hit the end of file
delimiterNotFound = true;
// If the final letters matched, then we will get an offset of "0".
// In that case, we'll either have a match (and break from the while loop,)
// or we need to move the scan forward one position.
if (currentOffset == 0)
{
currentOffset = 1;
if (actualDelimiter.Length <= _currentLineContent.Length)
{
delimiterNotFound = false;
int i = 0;
int j = _currentLineContent.Length - actualDelimiter.Length;
for (; i < actualDelimiter.Length; i++, j++)
{
// Get the change, and try to read more characters
// We only wait for changes when read forwards, so here we don't need to check if 'readBackward' is
// true or false, we only use 'reader'. The member 'reader' will be updated by WaitForChanges.
WaitForChanges(_path, _mode, _access, _share, _reader.CurrentEncoding);
numRead += _reader.Read(readBuffer, 0, (currentOffset - numRead));
if (actualDelimiter[i] != _currentLineContent[j])
{
delimiterNotFound = true;
break;
}
}
}
}
if (numRead > 0)
{
content.Append(readBuffer, 0, numRead);
// Look up the final character in our offset table.
// If the character doesn't exist in the lookup table, then it's not in
// our search key. That means the match must happen strictly /after/ the
// current position. Because of that, we can feel confident reading in the
// number of characters in the search key, without the risk of reading too many.
if (!offsetDictionary.TryGetValue(content[content.Length - 1], out currentOffset))
currentOffset = actualDelimiter.Length;
// If the final letters matched, then we will get an offset of "0".
// In that case, we'll either have a match (and break from the while loop,)
// or we need to move the scan forward one position.
if (currentOffset == 0)
currentOffset = 1;
}
}
// Two cases where we want to keep reading:
// 1. Raw stream and we haven't hit the end of file
// 2. Delimiter not found and we haven't hit the end of file
} while ((_isRawStream && (numRead != 0)) ||
((content.ToString().IndexOf(actualDelimiter, StringComparison.Ordinal) < 0) && (numRead != 0)));
} while (delimiterNotFound && (numRead != 0));
// We've reached the end of file or end of line.
if (content.Length > 0)
if (_currentLineContent.Length > 0)
{
// Add the block read to the output array list, trimming a trailing delimiter, if present.
// Note: If -Tail was specified, we get here in the course of 2 distinct passes:
// - Once while reading backward simply to determine the appropriate *start position* for later forward reading, ignoring the content of the blocks read (in reverse).
// - Then again during forward reading, for regular output processing; it is only then that trimming the delimiter is necessary.
// (Trimming it during backward reading would not only be unnecessary, but could interfere with determining the correct start position.)
string contentString = content.ToString();
blocks.Add(
!readBackward && contentString.EndsWith(actualDelimiter, StringComparison.Ordinal) && !_isRawStream
? contentString.Substring(0, content.Length - actualDelimiter.Length)
: contentString
!readBackward && !delimiterNotFound
? _currentLineContent.ToString(0, _currentLineContent.Length - actualDelimiter.Length)
: _currentLineContent.ToString()
);
}
@@ -683,7 +693,7 @@ namespace Microsoft.PowerShell.Commands
return true;
else
{
if (readBackward && content.Length > 0)
if (readBackward && _currentLineContent.Length > 0)
{
return true;
}
@@ -1259,14 +1269,32 @@ namespace Microsoft.PowerShell.Commands
/// <summary>
/// Read a specific maximum of characters from the current stream into a buffer
/// </summary>
/// <param name="buffer"></param>
/// <param name="index"></param>
/// <param name="count"></param>
/// <param name="buffer">Output buffer.</param>
/// <param name="index">Start position to write with.</param>
/// <param name="count">Number of bytes to read.</param>
/// <returns>Return the number of characters read, or -1 if we reach the head of the file</returns>
public override int Read(char[] buffer, int index, int count)
{
return ReadSpan(new Span<char>(buffer, index, count));
}
/// <summary>
/// Read characters from the current stream into a Span buffer.
/// </summary>
/// <param name="buffer">Output buffer.</param>
/// <returns>Return the number of characters read, or -1 if we reach the head of the file.</returns>
public override int Read(Span<char> buffer)
{
return ReadSpan(buffer);
}
private int ReadSpan(Span<char> buffer)
{
// deal with the argument validation
int charRead = 0;
int index = 0;
int count = buffer.Length;
do
{
@@ -1284,7 +1312,8 @@ namespace Microsoft.PowerShell.Commands
{
buffer[index++] = _charBuff[--_charCount];
}
} while (count > 0);
}
while (count > 0);
return charRead;
}