| | | 1 | | using System.Text; |
| | | 2 | | using ValidateLib.ErrorsAndWarnings.Errors; |
| | | 3 | | using ValidateLib.TabularData.AnnotatedTabularDataModel; |
| | | 4 | | |
| | | 5 | | namespace ValidateLib.TabularData.Parsing |
| | | 6 | | { |
/// <summary>
/// Mutable cursor used while parsing a row: the row text together with the
/// index of the character that is processed next.
/// </summary>
record class Params
{
    /// <summary>
    /// Text of the row being parsed. Defaults to an empty string so the
    /// property is never null on a freshly constructed instance.
    /// </summary>
    public string rowContent { get; set; } = string.Empty;

    /// <summary>
    /// Index into <see cref="rowContent"/> of the next character to process.
    /// </summary>
    public int position { get; set; }
}
| | | 12 | | /// <summary> |
| | | 13 | | /// Parses one row from the CSV file and returns fields contained in this row. |
| | | 14 | | /// Implements this algorithm: https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#dfn-parse-a-row |
| | | 15 | | /// </summary> |
| | | 16 | | public class RowParser : Reader |
| | | 17 | | { |
// Cells completed so far for the row currently being parsed.
private List<string> cellValues = new();

// Characters collected for the cell that is still being read.
private StringBuilder currentCellValue = new();

// Result of the last trimming pass over currentCellValue.
private string trimmedCellValue = "";

// True while the configured quote string is the standard double quote.
protected bool defaultQuoteCase = true;

// True while the parser is inside a quoted section.
private bool quoted = false;

// Optional annotated row that receives validation errors.
private Row? _row;

// Set once a "char outside quotes" error was recorded; cleared when a cell is added.
private bool charOutsideQuotesAdded = false;
| | | 25 | | |
/// <summary>
/// Creates a row parser driven by the supplied dialect flags.
/// </summary>
/// <param name="flags">Parsing flags; also passed on to the base constructor.</param>
public RowParser(Flags flags) : base(flags)
{
    bool usesStandardQuote = flags.quoteCharacter == "\"";

    // The flag starts out true through its field initializer, so it only
    // ever needs to be switched off here.
    if (!usesStandardQuote)
    {
        defaultQuoteCase = false;
    }
}
/// <summary>
/// Splits the text of one row into its cell values.
/// </summary>
/// <param name="row">Text of the row to parse.</param>
/// <param name="annotatedRow">
/// Optional annotated row. When supplied, validation errors detected while
/// parsing are added to its error collection.
/// </param>
/// <returns>
/// The cell values in the order they appear in the row. The list always has
/// at least one entry because the trailing cell is added unconditionally.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="row"/> is null.</exception>
public List<string> ParseRow(string row, Row? annotatedRow = null)
{
    // Fail at the API boundary instead of with a NullReferenceException
    // deep inside the bounds check.
    if (row is null)
    {
        throw new System.ArgumentNullException(nameof(row));
    }

    // Reset every piece of per-row state, including the error latch, so a
    // previous call that ended abnormally cannot influence this one.
    _row = annotatedRow;
    cellValues = new List<string>();
    currentCellValue = new StringBuilder();
    quoted = false;
    charOutsideQuotesAdded = false;

    Params parameters = new Params()
    {
        rowContent = row,
        position = 0
    };

    // Each step consumes at least one character and advances the cursor.
    while (isPositionInBound(parameters))
    {
        ProcessChar(parameters);
    }

    // Whatever was collected after the last delimiter forms the final cell.
    AddCurrentCell();
    return cellValues;
}
/// <summary>
/// Tells whether <paramref name="position"/> lies before the end of
/// <paramref name="row"/>. Only the upper bound is checked.
/// </summary>
bool isPositionInBound(string row, int position)
{
    return row.Length > position;
}
/// <summary>
/// Tells whether the cursor still points at a character of its row text.
/// </summary>
bool isPositionInBound(Params parameters)
{
    int cursor = parameters.position;
    int rowLength = parameters.rowContent.Length;
    return cursor < rowLength;
}
// Presumably returns the part of the row text that begins at the current position.
// NOTE(review): the line below is cut off in this listing, so the exact Substring
// arguments cannot be confirmed here.
| | 0 | 57 | | string getSubStringStartingFromPosition(Params parameters) => parameters.rowContent.Substring(parameters.positio |
/// <summary>
/// Reads the character the cursor currently points at.
/// </summary>
private char GetCharAtPosition(Params parameters)
{
    string text = parameters.rowContent;
    return text[parameters.position];
}
/// <summary>
/// Tells whether the configured quote string starts at the current position.
/// </summary>
/// <param name="parameters">Cursor; its position must be inside the row text.</param>
/// <returns>
/// True when the text at the cursor is the quote string. A null or empty
/// quote string never matches.
/// </returns>
private bool IsQuote(Params parameters)
{
    string text = parameters.rowContent;
    int start = parameters.position;

    if (defaultQuoteCase)
    {
        // Fast path: the quote is the single standard double-quote character.
        return text[start] == '"';
    }

    var quote = flags.quoteCharacter;
    if (string.IsNullOrEmpty(quote))
    {
        // Nothing to match against; treat as "no quoting" rather than
        // throwing or matching at every position.
        return false;
    }

    // Compare in place and ordinally: no substring is allocated per
    // character and the result does not depend on the current culture.
    return string.CompareOrdinal(text, start, quote, 0, quote.Length) == 0;
}
/// <summary>
/// Tells whether the complete delimiter starts at the current position.
/// </summary>
/// <param name="parameters">Cursor over the row text.</param>
/// <returns>
/// True only when every character of the delimiter is present at the cursor.
/// A delimiter that is cut short by the end of the row does not count.
/// </returns>
private bool IsDelimiter(Params parameters)
{
    int matched = 0;
    int rowIndex = parameters.position;

    while (rowIndex < parameters.rowContent.Length && matched < flags.delimiter.Length)
    {
        if (flags.delimiter[matched] != parameters.rowContent[rowIndex])
        {
            return false;
        }

        matched++;
        rowIndex++;
    }

    // The loop also ends when the row runs out; in that case only a prefix
    // of the delimiter was seen, which must not be reported as a match.
    return matched == flags.delimiter.Length;
}
| | | 80 | | |
/// <summary>
/// Processes the text at the cursor and advances the cursor past it.
/// The checks run in a fixed order: escape character, quote, delimiter,
/// and finally ordinary character.
/// </summary>
/// <param name="parameters">Cursor; its position must be inside the row text.</param>
/// <returns>Always false.</returns>
bool ProcessChar(Params parameters)
{
    char current = parameters.rowContent[parameters.position];

    if (current == flags.escapeCharacter)
    {
        HandleEscapeChar(parameters);
        return false;
    }

    if (IsQuote(parameters))
    {
        HandleQuoteChar(parameters);
        return false;
    }

    if (IsDelimiter(parameters))
    {
        HandleDelimiter(parameters);
        return false;
    }

    // Ordinary character: it becomes part of the current cell.
    currentCellValue.Append(current);
    parameters.position += 1;
    return false;
}
| | | 103 | | |
/// <summary>
/// Handles a delimiter found at the current position.
/// Inside quotes the character is ordinary cell content; outside quotes it
/// ends the current cell and the cursor moves past the whole delimiter.
/// </summary>
/// <param name="parameters">Cursor positioned on the first delimiter character.</param>
private void HandleDelimiter(Params parameters)
{
    if (quoted)
    {
        // Quoted delimiters are data; take one character and carry on.
        currentCellValue.Append(GetCharAtPosition(parameters));
        parameters.position++;
        return;
    }

    AddCurrentCell();
    currentCellValue = new StringBuilder();

    // Skip every character of the delimiter so none of them ends up in the
    // next cell. Always move at least one step so parsing cannot stall.
    int delimiterLength = flags.delimiter.Length;
    parameters.position += delimiterLength > 0 ? delimiterLength : 1;
}
| | | 118 | | |
/// <summary>
/// Handles an escape character found at the current position.
/// </summary>
/// <remarks>
/// Three cases, checked in this order:
/// 1. escape followed by the quote string: the quote string is added to the
///    cell and the cursor moves past the escape and the whole quote string;
/// 2. escape differs from the quote string: the character after the escape
///    (if any) is added to the cell and both are skipped;
/// 3. escape equals the quote string: the character is handled as a quote.
/// NOTE(review): case 2 runs whether or not the parser is inside quotes;
/// the W3C "parse a row" algorithm applies it only when quoted. Confirm
/// whether that is intentional.
/// </remarks>
/// <param name="parameters">Cursor positioned on the escape character.</param>
private void HandleEscapeChar(Params parameters)
{
    if (IsEscapeFollowedByQuote(parameters.rowContent, parameters.position))
    {
        currentCellValue.Append(flags.quoteCharacter);

        // One step for the escape character plus the full length of the
        // quote string, which may be longer than one character.
        int quoteLength = flags.quoteCharacter?.Length ?? 0;
        parameters.position += 1 + quoteLength;
        return;
    }

    if (flags.quoteCharacter != flags.escapeCharacter.ToString())
    {
        int escapedIndex = parameters.position + 1;

        // An escape at the very end of the row has nothing to escape.
        if (isPositionInBound(parameters.rowContent, escapedIndex))
        {
            currentCellValue.Append(parameters.rowContent[escapedIndex]);
        }

        parameters.position += 2;
        return;
    }

    HandleQuoteChar(parameters);
}
| | | 141 | | |
// Handles a quote found at the current position by toggling the `quoted` state
// and moving the cursor one position forward.
//
// Opening quote (quoted was false): if the current cell already holds text that
// is not exactly the quote string, a "char outside quotes" error is added to the
// annotated row, if one was supplied.
//
// Closing quote (quoted was true): after advancing, if a character remains and it
// is neither a delimiter nor a quote, the same error is added.
//
// charOutsideQuotesAdded ensures the error is recorded at most once until the
// flag is cleared again elsewhere in the class.
//
// NOTE(review): both ErrorFactory.GetCharOutsideQuotesVE call lines are cut off
// in this listing, so their last argument cannot be confirmed here.
// NOTE(review): the cursor advances by one although flags.quoteCharacter is a
// string - confirm the behavior for quote strings longer than one character.
| | | 142 | | private void HandleQuoteChar(Params parameters) |
| | | 143 | | { |
| | 1 | 144 | | if (!quoted) |
| | | 145 | | { |
| | 1 | 146 | | quoted = true; |
| | | 147 | | |
| | | 148 | | |
| | 1 | 149 | | if (currentCellValue.Length > 0 && currentCellValue.ToString() != flags.quoteCharacter) |
| | | 150 | | { |
| | 0 | 151 | | if (!charOutsideQuotesAdded) |
| | | 152 | | { |
| | 0 | 153 | | _row?.errors?.Add(ErrorFactory.GetCharOutsideQuotesVE(_row.table.url, _row.sourceNumber, cellVal |
| | 0 | 154 | | charOutsideQuotesAdded = true; |
| | | 155 | | } |
| | | 156 | | } |
| | | 157 | | |
| | | 158 | | |
| | 1 | 159 | | parameters.position++; |
| | | 160 | | } |
| | | 161 | | else |
| | | 162 | | { |
| | 1 | 163 | | quoted = false; |
| | 1 | 164 | | parameters.position++; |
| | | 165 | | |
| | 1 | 166 | | if (isPositionInBound(parameters) |
| | 1 | 167 | | && !IsDelimiter(parameters) |
| | 1 | 168 | | && !IsQuote(parameters)) |
| | | 169 | | { |
| | 0 | 170 | | if (!charOutsideQuotesAdded) |
| | | 171 | | { |
| | 0 | 172 | | _row?.errors?.Add(ErrorFactory.GetCharOutsideQuotesVE(_row.table.url, _row.sourceNumber, cellVal |
| | 0 | 173 | | charOutsideQuotesAdded = true; |
| | | 174 | | } |
| | | 175 | | } |
| | | 176 | | |
| | | 177 | | |
| | | 178 | | } |
| | | 179 | | |
| | 1 | 180 | | } |
/// <summary>
/// Tells whether the text at <paramref name="position"/> is the escape
/// character immediately followed by the quote string.
/// </summary>
/// <param name="rowContent">Row text to inspect.</param>
/// <param name="position">Index at which the sequence must start.</param>
/// <returns>
/// True on an exact match; false when the position is negative, when the
/// sequence would not fit into the remaining text, or when the text differs.
/// </returns>
private bool IsEscapeFollowedByQuote(string rowContent, int position)
{
    string pattern = flags.escapeCharacter.ToString() + flags.quoteCharacter;
    int end = position + pattern.Length;

    if (position < 0 || end > rowContent.Length)
    {
        return false;
    }

    string candidate = rowContent.Substring(position, pattern.Length);
    return candidate == pattern;
}
| | | 192 | | |
/// <summary>
/// Stores the current cell text in <c>trimmedCellValue</c>, with leading
/// and/or trailing whitespace removed as requested by <c>flags.trim</c>.
/// </summary>
private void ConditionallyTrimCellValue()
{
    string value = currentCellValue.ToString();
    bool trimBothEnds = flags.trim == TrimOptions.TRUE;

    if (trimBothEnds || flags.trim == TrimOptions.START)
    {
        value = value.TrimStart();
    }

    if (trimBothEnds || flags.trim == TrimOptions.END)
    {
        value = value.TrimEnd();
    }

    trimmedCellValue = value;
}
| | | 205 | | |
/// <summary>
/// Finishes the current cell: trims it according to the flags, adds it to
/// the list of cell values and clears the "char outside quotes" latch.
/// </summary>
private void AddCurrentCell()
{
    ConditionallyTrimCellValue();
    cellValues.Add(trimmedCellValue);

    // The next cell may report its own error again.
    charOutsideQuotesAdded = false;
}
| | | 212 | | } |
| | | 213 | | } |