From 810a9a55930951e547529a44e46c6f1829355fbf Mon Sep 17 00:00:00 2001
From: Rick Elrod <rick@elrod.me>
Date: Tue, 4 Aug 2020 12:49:45 -0500
Subject: [PATCH] find: Allow reading whole file for contains regex (#71083)

Change:
- Add a parameter `read_whole_file` which allows for reading the whole
  file when doing a `contains` regex search.
- This allows, for example, matching a pattern at the very end of
  a file.

Test Plan:
- New integration tests

Tickets:
- Fixes #63378

Signed-off-by: Rick Elrod <rick@elrod.me>
---
 .../63378_find_module_regex_whole_file.yml    |  2 +
 lib/ansible/modules/find.py                   | 18 ++++-
 test/integration/targets/find/files/a.txt     |  2 +
 test/integration/targets/find/files/log.txt   |  4 +
 test/integration/targets/find/tasks/main.yml  | 79 +++++++++++++++++++
 5 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 changelogs/fragments/63378_find_module_regex_whole_file.yml
 create mode 100644 test/integration/targets/find/files/a.txt
 create mode 100644 test/integration/targets/find/files/log.txt

diff --git a/changelogs/fragments/63378_find_module_regex_whole_file.yml b/changelogs/fragments/63378_find_module_regex_whole_file.yml
new file mode 100644
index 00000000000..28012ece6cb
--- /dev/null
+++ b/changelogs/fragments/63378_find_module_regex_whole_file.yml
@@ -0,0 +1,2 @@
+minor_changes:
+  - find module - Now has a ``read_whole_file`` boolean parameter which allows for reading the whole file and doing an ``re.search()`` regex evaluation on it when searching using the ``contains`` option. This allows, for example, ensuring that the very end of the file matches a pattern.
diff --git a/lib/ansible/modules/find.py b/lib/ansible/modules/find.py
index cfefaaa9630..9e0da0c95e1 100644
--- a/lib/ansible/modules/find.py
+++ b/lib/ansible/modules/find.py
@@ -57,6 +57,15 @@ options:
         description:
             - A regular expression or pattern which should be matched against the file content.
         type: str
+    read_whole_file:
+        description:
+            - When doing a C(contains) search, determines whether the whole file should be read into
+              memory or if the regex should be applied to the file line-by-line.
+            - Setting this to C(true) can have performance and memory implications for large files.
+            - When C(true), the regex is applied with C(re.search()) instead of the line-by-line C(re.match()).
+        type: bool
+        default: false
+        version_added: "2.11"
     paths:
         description:
             - List of paths of directories to search. All paths must be fully qualified.
@@ -283,11 +292,12 @@ def sizefilter(st, size):
     return False
 
 
-def contentfilter(fsname, pattern):
+def contentfilter(fsname, pattern, read_whole_file=False):
     """
     Filter files which contain the given expression
     :arg fsname: Filename to scan for lines matching a pattern
     :arg pattern: Pattern to look for inside of line
+    :arg read_whole_file: If true, the whole file is read into memory before the regex is applied against it. Otherwise, the regex is applied line-by-line.
     :rtype: bool
     :returns: True if one of the lines in fsname matches the pattern. Otherwise False
     """
@@ -298,6 +308,9 @@ def contentfilter(fsname, pattern):
 
     try:
         with open(fsname) as f:
+            if read_whole_file:
+                return bool(prog.search(f.read()))
+
             for line in f:
                 if prog.match(line):
                     return True
@@ -363,6 +376,7 @@ def main():
             patterns=dict(type='list', default=['*'], aliases=['pattern'], elements='str'),
             excludes=dict(type='list', aliases=['exclude'], elements='str'),
             contains=dict(type='str'),
+            read_whole_file=dict(type='bool', default=False),
             file_type=dict(type='str', default="file", choices=['any', 'directory', 'file', 'link']),
             age=dict(type='str'),
             age_stamp=dict(type='str', default="mtime", choices=['atime', 'ctime', 'mtime']),
@@ -445,7 +459,7 @@ def main():
                     elif stat.S_ISREG(st.st_mode) and params['file_type'] == 'file':
                         if pfilter(fsobj, params['patterns'], params['excludes'], params['use_regex']) and \
                            agefilter(st, now, age, params['age_stamp']) and \
-                           sizefilter(st, size) and contentfilter(fsname, params['contains']):
+                           sizefilter(st, size) and contentfilter(fsname, params['contains'], params['read_whole_file']):
 
                             r.update(statinfo(st))
                             if params['get_checksum']:
diff --git a/test/integration/targets/find/files/a.txt b/test/integration/targets/find/files/a.txt
new file mode 100644
index 00000000000..30b622a3b93
--- /dev/null
+++ b/test/integration/targets/find/files/a.txt
@@ -0,0 +1,2 @@
+this is a file that has
+a few lines in it
diff --git a/test/integration/targets/find/files/log.txt b/test/integration/targets/find/files/log.txt
new file mode 100644
index 00000000000..679893bcbf6
--- /dev/null
+++ b/test/integration/targets/find/files/log.txt
@@ -0,0 +1,4 @@
+01/01- OK
+01/02- OK
+01/03- KO
+01/04- OK
diff --git a/test/integration/targets/find/tasks/main.yml b/test/integration/targets/find/tasks/main.yml
index 456f4bc680d..cc718e92db2 100644
--- a/test/integration/targets/find/tasks/main.yml
+++ b/test/integration/targets/find/tasks/main.yml
@@ -114,3 +114,82 @@
           - 'find_test3.matched == 1'
           - 'find_test3.files[0].pw_name is defined'
           - 'find_test3.files[0].gr_name is defined'
+
+- name: Copy some files into the test dir
+  copy:
+      src: "{{ item }}"
+      dest: "{{ output_dir_test }}/{{ item }}"
+      mode: 0644
+  with_items:
+      - a.txt
+      - log.txt
+
+- name: Ensure '$' only matches the true end of the file with read_whole_file, not a line
+  find:
+      paths: "{{ output_dir_test }}"
+      patterns: "*.txt"
+      contains: "KO$"
+      read_whole_file: true
+  register: whole_no_match
+
+- debug: var=whole_no_match
+
+- assert:
+      that:
+          - whole_no_match.matched == 0
+
+- name: Match the end of the file successfully
+  find:
+      paths: "{{ output_dir_test }}"
+      patterns: "*.txt"
+      contains: "OK$"
+      read_whole_file: true
+  register: whole_match
+
+- debug: var=whole_match
+
+- assert:
+      that:
+          - whole_match.matched == 1
+
+- name: When read_whole_file=False, $ should match an individual line
+  find:
+      paths: "{{ output_dir_test }}"
+      patterns: "*.txt"
+      contains: ".*KO$"
+      read_whole_file: false
+  register: match_end_of_line
+
+- debug: var=match_end_of_line
+
+- assert:
+      that:
+          - match_end_of_line.matched == 1
+
+- name: When read_whole_file=True, match across line boundaries
+  find:
+      paths: "{{ output_dir_test }}"
+      patterns: "*.txt"
+      contains: "has\na few"
+      read_whole_file: true
+  register: match_line_boundaries
+
+- debug: var=match_line_boundaries
+
+- assert:
+      that:
+          - match_line_boundaries.matched == 1
+
+- name: When read_whole_file=False, do not match across line boundaries
+  find:
+      paths: "{{ output_dir_test }}"
+      patterns: "*.txt"
+      contains: "has\na few"
+      read_whole_file: false
+  register: no_match_line_boundaries
+
+- debug: var=no_match_line_boundaries
+
+- assert:
+      that:
+          - no_match_line_boundaries.matched == 0