From b8c08ba5add1406783cec2333d6ad7011a29e01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Wed, 4 Aug 2021 11:20:36 +0200 Subject: [PATCH] Tests: Improve coverage for `File::get_csv_line()` Adds a few more complex edge cases which are supported. Also adds some documentation, simplifies the code a bit and forbids using double quotes as a delimiter. --- core/io/file_access.cpp | 37 +++++++++++++++++++------------------ doc/classes/File.xml | 11 +++++++++-- tests/data/translations.csv | 5 +++++ tests/test_file_access.h | 29 ++++++++++++++++++++++++----- 4 files changed, 57 insertions(+), 25 deletions(-) diff --git a/core/io/file_access.cpp b/core/io/file_access.cpp index d21c0bd9a2..e6e79dff8a 100644 --- a/core/io/file_access.cpp +++ b/core/io/file_access.cpp @@ -316,52 +316,53 @@ String FileAccess::get_line() const { } Vector FileAccess::get_csv_line(const String &p_delim) const { - ERR_FAIL_COND_V(p_delim.length() != 1, Vector()); + ERR_FAIL_COND_V_MSG(p_delim.length() != 1, Vector(), "Only single character delimiters are supported to parse CSV lines."); + ERR_FAIL_COND_V_MSG(p_delim[0] == '"', Vector(), "The double quotation mark character (\") is not supported as a delimiter for CSV lines."); - String l; + String line; + + // CSV can support entries with line breaks as long as they are enclosed + // in double quotes. So our "line" might be more than a single line in the + // text file. int qc = 0; do { if (eof_reached()) { break; } - - l += get_line() + "\n"; + line += get_line() + "\n"; qc = 0; - for (int i = 0; i < l.length(); i++) { - if (l[i] == '"') { + for (int i = 0; i < line.length(); i++) { + if (line[i] == '"') { qc++; } } - } while (qc % 2); - l = l.substr(0, l.length() - 1); + // Remove the extraneous newline we've added above. + line = line.substr(0, line.length() - 1); Vector strings; bool in_quote = false; String current; - for (int i = 0; i < l.length(); i++) { - char32_t c = l[i]; - char32_t s[2] = { 0, 0 }; - + for (int i = 0; i < line.length(); i++) { + char32_t c = line[i]; + // A delimiter ends the current entry, unless it's in a quoted string. if (!in_quote && c == p_delim[0]) { strings.push_back(current); current = String(); } else if (c == '"') { - if (l[i + 1] == '"' && in_quote) { - s[0] = '"'; - current += s; + // Doubled quotes are escapes for intentional quotes in the string. + if (line[i + 1] == '"' && in_quote) { + current += '"'; i++; } else { in_quote = !in_quote; } } else { - s[0] = c; - current += s; + current += c; } } - strings.push_back(current); return strings; diff --git a/doc/classes/File.xml b/doc/classes/File.xml index de3beedf0f..6622619fb3 100644 --- a/doc/classes/File.xml +++ b/doc/classes/File.xml @@ -119,8 +119,15 @@ - Returns the next value of the file in CSV (Comma-Separated Values) format. You can pass a different delimiter [code]delim[/code] to use other than the default [code]","[/code] (comma). This delimiter must be one-character long. - Text is interpreted as being UTF-8 encoded. + Returns the next value of the file in CSV (Comma-Separated Values) format. You can pass a different delimiter [code]delim[/code] to use other than the default [code]","[/code] (comma). This delimiter must be one-character long, and cannot be a double quotation mark. + Text is interpreted as being UTF-8 encoded. Text values must be enclosed in double quotes if they include the delimiter character. Double quotes within a text value can be escaped by doubling their occurrence. + For example, the following CSV lines are valid and will be properly parsed as two strings each: + [codeblock] + Alice,"Hello, Bob!" + Bob,Alice! What a surprise! + Alice,"I thought you'd reply with ""Hello, world""." + [/codeblock] + Note how the second line can omit the enclosing quotes as it does not include the delimiter. However it [i]could[/i] very well use quotes, it was only written without for demonstration purposes. The third line must use [code]""[/code] for each quotation mark that needs to be interpreted as such instead of the end of a text value. diff --git a/tests/data/translations.csv b/tests/data/translations.csv index 4c9ad4996a..8cb7b800c5 100644 --- a/tests/data/translations.csv +++ b/tests/data/translations.csv @@ -1,3 +1,8 @@ keys,en,de GOOD_MORNING,"Good Morning","Guten Morgen" GOOD_EVENING,"Good Evening","" +Without quotes,"With, comma","With ""inner"" quotes","With ""inner"", quotes"","" and comma","With ""inner +split"" quotes and +line breaks","With \nnewline chars" +Some other~delimiter~should still work, shouldn't it? +What about tab separated lines, good? diff --git a/tests/test_file_access.h b/tests/test_file_access.h index cb74e08a0d..b3da16c1d1 100644 --- a/tests/test_file_access.h +++ b/tests/test_file_access.h @@ -37,12 +37,12 @@ namespace TestFileAccess { TEST_CASE("[FileAccess] CSV read") { - FileAccess *f = FileAccess::open(TestUtils::get_data_path("translations.csv"), FileAccess::READ); + FileAccessRef f = FileAccess::open(TestUtils::get_data_path("translations.csv"), FileAccess::READ); - Vector header = f->get_csv_line(); // Default delimiter: "," + Vector header = f->get_csv_line(); // Default delimiter: ",". REQUIRE(header.size() == 3); - Vector row1 = f->get_csv_line(","); + Vector row1 = f->get_csv_line(","); // Explicit delimiter, should be the same. REQUIRE(row1.size() == 3); CHECK(row1[0] == "GOOD_MORNING"); CHECK(row1[1] == "Good Morning"); @@ -53,12 +53,31 @@ TEST_CASE("[FileAccess] CSV read") { CHECK(row2[0] == "GOOD_EVENING"); CHECK(row2[1] == "Good Evening"); CHECK(row2[2] == ""); // Use case: not yet translated! - // https://github.com/godotengine/godot/issues/44269 CHECK_MESSAGE(row2[2] != "\"", "Should not parse empty string as a single double quote."); + Vector row3 = f->get_csv_line(); + REQUIRE(row3.size() == 6); + CHECK(row3[0] == "Without quotes"); + CHECK(row3[1] == "With, comma"); + CHECK(row3[2] == "With \"inner\" quotes"); + CHECK(row3[3] == "With \"inner\", quotes\",\" and comma"); + CHECK(row3[4] == "With \"inner\nsplit\" quotes and\nline breaks"); + CHECK(row3[5] == "With \\nnewline chars"); // Escaped, not an actual newline. + + Vector row4 = f->get_csv_line("~"); // Custom delimiter, makes inline commas easier. + REQUIRE(row4.size() == 3); + CHECK(row4[0] == "Some other"); + CHECK(row4[1] == "delimiter"); + CHECK(row4[2] == "should still work, shouldn't it?"); + + Vector row5 = f->get_csv_line("\t"); // Tab separated variables. + REQUIRE(row5.size() == 3); + CHECK(row5[0] == "What about"); + CHECK(row5[1] == "tab separated"); + CHECK(row5[2] == "lines, good?"); + f->close(); - memdelete(f); } } // namespace TestFileAccess