| | 1 | | using System.Text; |
| | 2 | | using ValidateLib.ErrorsAndWarnings.Errors; |
| | 3 | | using ValidateLib.TabularData.AnnotatedTabularDataModel; |
| | 4 | |
|
| | 5 | | namespace ValidateLib.TabularData.Parsing |
| | 6 | | { |
/// <summary>
/// Mutable cursor over a single row: the row text and the index of the
/// character that is processed next.
/// </summary>
record class Params
{
    /// <summary>Full text of the row being parsed. Defaults to the empty string so it is never null.</summary>
    public string rowContent { get; set; } = "";

    /// <summary>Zero-based index of the next character to process in <see cref="rowContent"/>.</summary>
    public int position { get; set; }
}
| | 12 | | /// <summary> |
| | 13 | | /// Parses one row from the CSV file and returns fields contained in this row. |
| | 14 | | /// Implements this algorithm: https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#dfn-parse-a-row |
| | 15 | | /// </summary> |
| | 16 | | public class RowParser : Reader |
| | 17 | | { |
// Cells collected so far for the row that is being parsed.
private List<string> cellValues = new();

// Characters accumulated for the cell that is currently being read.
private StringBuilder currentCellValue = new();

// Result of the most recent conditional trim of the current cell.
private string trimmedCellValue = "";

// True while the configured quote string is the plain double quote.
protected bool defaultQuoteCase = true;

// True while the cursor is inside a quoted section.
private bool quoted;

// Annotated row that receives validation errors; null when none was supplied.
private Row? _row;

// Set once the "char outside quotes" error was reported; cleared when a cell is stored.
private bool charOutsideQuotesAdded;
| | 25 | |
|
/// <summary>
/// Creates a parser for the given dialect flags and remembers whether the
/// configured quote string is the plain double quote.
/// </summary>
/// <param name="flags">Dialect settings; forwarded to the base class.</param>
public RowParser(Flags flags) : base(flags)
{
    // The field starts out as true, so assigning the comparison result
    // changes it only for a non-default quote string.
    defaultQuoteCase = flags.quoteCharacter == "\"";
}
/// <summary>
/// Splits one row into its cell values.
/// </summary>
/// <param name="row">Text of the row to split.</param>
/// <param name="annotatedRow">Optional annotated row kept for error reporting while parsing.</param>
/// <returns>The list of cell values found in the row, in order.</returns>
public List<string> ParseRow(string row, Row? annotatedRow = null)
{
    // Reset the per-row state before scanning.
    quoted = false;
    currentCellValue = new StringBuilder();
    cellValues = new List<string>();
    _row = annotatedRow;

    var cursor = new Params { rowContent = row, position = 0 };

    // Each ProcessChar call moves the cursor forward by at least one character.
    while (isPositionInBound(cursor))
    {
        ProcessChar(cursor);
    }

    // The last cell has no trailing delimiter, so it is stored here.
    AddCurrentCell();
    return cellValues;
}
/// <summary>Tells whether <paramref name="position"/> lies before the end of <paramref name="row"/>.</summary>
private bool isPositionInBound(string row, int position)
{
    return row.Length > position;
}

/// <summary>Tells whether the cursor position lies before the end of the cursor's row content.</summary>
private bool isPositionInBound(Params parameters)
{
    return isPositionInBound(parameters.rowContent, parameters.position);
}
// Returns a substring of the row content that begins at the cursor position.
// NOTE(review): the end of this line is cut off in the available listing, so any
// further Substring arguments cannot be confirmed from here.
| | 57 | | string getSubStringStartingFromPosition(Params parameters) => parameters.rowContent.Substring(parameters.positio |
/// <summary>Returns the character of the row content located at the cursor position.</summary>
private char GetCharAtPosition(Params parameters)
{
    string content = parameters.rowContent;
    return content[parameters.position];
}
/// <summary>
/// Tells whether the configured quote string starts at the cursor position.
/// </summary>
/// <param name="parameters">Cursor holding the row content and the position to test.</param>
/// <returns>
/// True when the row content at the cursor position begins with the quote string;
/// false otherwise, including when the quote string is null or empty.
/// </returns>
private bool IsQuote(Params parameters)
{
    if (defaultQuoteCase)
    {
        // Fast path: the quote string is the single double-quote character.
        return parameters.rowContent[parameters.position] == '"';
    }

    var quote = flags.quoteCharacter;
    if (string.IsNullOrEmpty(quote))
    {
        // Without a quote string nothing can be quoted; an empty string would
        // otherwise match at every position.
        return false;
    }

    // Ordinal comparison in place: no substring allocation per character and no
    // culture-dependent matching. A quote string that would run past the end of
    // the row compares as unequal.
    return string.CompareOrdinal(parameters.rowContent, parameters.position, quote, 0, quote.Length) == 0;
}
/// <summary>
/// Tells whether the complete delimiter starts at the cursor position.
/// </summary>
/// <param name="parameters">Cursor holding the row content and the position to test.</param>
/// <returns>
/// True only when every character of the delimiter is present at the cursor
/// position; false for a partial match at the end of the row and for an empty delimiter.
/// </returns>
private bool IsDelimiter(Params parameters)
{
    int delimiterLength = flags.delimiter.Length;
    int start = parameters.position;

    // An empty delimiter never matches, and a delimiter that does not fit into
    // the rest of the row cannot match either (previously a row ending in the
    // first part of a multi-character delimiter was reported as a match).
    if (delimiterLength == 0 || start + delimiterLength > parameters.rowContent.Length)
    {
        return false;
    }

    for (int offset = 0; offset < delimiterLength; offset++)
    {
        if (flags.delimiter[offset] != parameters.rowContent[start + offset])
        {
            return false;
        }
    }

    return true;
}
| | 80 | |
|
/// <summary>
/// Handles the character at the cursor position and moves the cursor forward.
/// The checks run in a fixed order: escape character, quote, delimiter, and
/// finally a plain character that is copied into the current cell.
/// </summary>
/// <param name="parameters">Cursor holding the row content and the position to process.</param>
/// <returns>Always false.</returns>
bool ProcessChar(Params parameters)
{
    char ch = parameters.rowContent[parameters.position];

    if (ch == flags.escapeCharacter)
    {
        HandleEscapeChar(parameters);
        return false;
    }

    if (IsQuote(parameters))
    {
        HandleQuoteChar(parameters);
        return false;
    }

    if (IsDelimiter(parameters))
    {
        HandleDelimiter(parameters);
        return false;
    }

    // Ordinary character: it belongs to the current cell.
    currentCellValue.Append(ch);
    parameters.position++;
    return false;
}
| | 103 | |
|
/// <summary>
/// Handles a delimiter found at the cursor position.
/// Inside quotes the character at the cursor is kept as cell content; outside
/// quotes the current cell is stored and the cursor moves past the delimiter.
/// </summary>
/// <param name="parameters">Cursor positioned on the first character of the delimiter.</param>
private void HandleDelimiter(Params parameters)
{
    if (quoted)
    {
        // Quoted delimiters are literal text. Copying one character at a time
        // lets the remaining delimiter characters go through the normal checks.
        currentCellValue.Append(GetCharAtPosition(parameters));
        parameters.position++;
        return;
    }

    AddCurrentCell();
    currentCellValue = new StringBuilder();

    // Skip the whole delimiter, not just its first character; otherwise the
    // tail of a multi-character delimiter ends up in the next cell.
    // Always advance by at least one so the scan cannot stall.
    int delimiterLength = flags.delimiter.Length;
    parameters.position += delimiterLength > 0 ? delimiterLength : 1;
}
| | 118 | |
|
/// <summary>
/// Handles an escape character found at the cursor position.
/// <list type="bullet">
/// <item>Escape followed by the quote string: the quote string is added to the cell and both are skipped.</item>
/// <item>Escape differs from the quote string: the following character (if any) is added to the cell and both are skipped.</item>
/// <item>Escape equals the quote string and no second quote follows: the character is handled as a quote.</item>
/// </list>
/// </summary>
/// <param name="parameters">Cursor positioned on the escape character.</param>
private void HandleEscapeChar(Params parameters)
{
    var quote = flags.quoteCharacter;
    bool hasQuote = !string.IsNullOrEmpty(quote);

    // With an empty quote string "escape + quote" would match every escape
    // character and swallow the character after it, so that case is left to
    // the plain-escape branch below.
    if (hasQuote && IsEscapeFollowedByQuote(parameters.rowContent, parameters.position))
    {
        currentCellValue.Append(quote);
        // One escape character plus the full quote string (which may be longer than one character).
        parameters.position += 1 + quote.Length;
        return;
    }

    if (quote != flags.escapeCharacter.ToString())
    {
        int escapedIndex = parameters.position + 1;
        if (isPositionInBound(parameters.rowContent, escapedIndex))
        {
            currentCellValue.Append(parameters.rowContent[escapedIndex]);
        }

        // Skip the escape character and the escaped character; a trailing
        // escape at the end of the row simply ends the scan.
        parameters.position += 2;
        return;
    }

    // Escape and quote are the same character and it is not doubled here.
    HandleQuoteChar(parameters);
}
| | 141 | |
|
// Handles a quote found at the cursor position by toggling the quoted state and
// advancing the cursor by one character.
// Opening a quote while the current cell already holds text other than the quote
// string reports a "char outside quotes" error on the annotated row (if one was given).
// Closing a quote reports the same error when the next character exists and is
// neither a delimiter nor a quote. The charOutsideQuotesAdded flag limits the
// report to one per cell.
// NOTE(review): the cursor advances by exactly one character in both branches, while
// IsQuote can match a quote string longer than one character; confirm whether
// multi-character quote strings are meant to be supported here.
// NOTE(review): the two error-reporting lines are cut off in the available listing.
| | 142 | | private void HandleQuoteChar(Params parameters) |
| | 143 | | { |
| | 144 | | if (!quoted) |
| | 145 | | { |
| | 146 | | quoted = true; |
| | 147 | |
|
| | 148 | |
|
// Text already collected before the opening quote means characters sit outside the quotes.
| | 149 | | if (currentCellValue.Length > 0 && currentCellValue.ToString() != flags.quoteCharacter) |
| | 150 | | { |
| | 151 | | if (!charOutsideQuotesAdded) |
| | 152 | | { |
| | 153 | | _row?.errors?.Add(ErrorFactory.GetCharOutsideQuotesVE(_row.table.url, _row.sourceNumber, cellVal |
| | 154 | | charOutsideQuotesAdded = true; |
| | 155 | | } |
| | 156 | | } |
| | 157 | |
|
| | 158 | |
|
| | 159 | | parameters.position++; |
| | 160 | | } |
| | 161 | | else |
| | 162 | | { |
| | 163 | | quoted = false; |
| | 164 | | parameters.position++; |
| | 165 | |
|
// After the closing quote only a delimiter, another quote, or the end of the row is accepted silently.
| | 166 | | if (isPositionInBound(parameters) |
| | 167 | | && !IsDelimiter(parameters) |
| | 168 | | && !IsQuote(parameters)) |
| | 169 | | { |
| | 170 | | if (!charOutsideQuotesAdded) |
| | 171 | | { |
| | 172 | | _row?.errors?.Add(ErrorFactory.GetCharOutsideQuotesVE(_row.table.url, _row.sourceNumber, cellVal |
| | 173 | | charOutsideQuotesAdded = true; |
| | 174 | | } |
| | 175 | | } |
| | 176 | |
|
| | 177 | |
|
| | 178 | | } |
| | 179 | |
|
| | 180 | | } |
/// <summary>
/// Tells whether the escape character immediately followed by the quote string
/// occurs in <paramref name="rowContent"/> at <paramref name="position"/>.
/// </summary>
/// <param name="rowContent">Text of the row.</param>
/// <param name="position">Index at which the sequence is expected to start.</param>
/// <returns>True when the whole sequence fits into the row and matches exactly.</returns>
private bool IsEscapeFollowedByQuote(string rowContent, int position)
{
    string sequence = flags.escapeCharacter.ToString() + flags.quoteCharacter;

    // The sequence must lie completely inside the row before it is compared.
    bool fits = position >= 0 && position + sequence.Length <= rowContent.Length;
    return fits && string.CompareOrdinal(rowContent, position, sequence, 0, sequence.Length) == 0;
}
| | 192 | |
|
/// <summary>
/// Copies the current cell text into <c>trimmedCellValue</c>, removing leading
/// and/or trailing whitespace as selected by the trim flag.
/// </summary>
private void ConditionallyTrimCellValue()
{
    var mode = flags.trim;
    bool trimBothEnds = mode == TrimOptions.TRUE;
    string text = currentCellValue.ToString();

    if (trimBothEnds || mode == TrimOptions.START)
    {
        text = text.TrimStart();
    }

    if (trimBothEnds || mode == TrimOptions.END)
    {
        text = text.TrimEnd();
    }

    trimmedCellValue = text;
}
| | 205 | |
|
/// <summary>
/// Finishes the current cell: trims it according to the trim flag, appends the
/// result to the list of cell values and re-arms the "char outside quotes"
/// error reporting for the next cell.
/// </summary>
private void AddCurrentCell()
{
    ConditionallyTrimCellValue();
    cellValues.Add(trimmedCellValue);

    // Allow the error to be reported again for the next cell.
    charOutsideQuotesAdded = false;
}
| | 212 | | } |
| | 213 | | } |