| | 1 | | using System.Text.RegularExpressions; |
| | 2 | | using ValidateLib.ErrorsAndWarnings.Errors; |
| | 3 | | using ValidateLib.ErrorsAndWarnings.Errors.Specific; |
| | 4 | | using ValidateLib.ErrorsAndWarnings.Errors.ValidationErrors; |
| | 5 | | using ValidateLib.Metadata.Descriptors; |
| | 6 | | using ValidateLib.TabularData.AnnotatedTabularDataModel; |
| | 7 | | using ValidateLib.TabularData.Datatypes; |
| | 8 | |
|
| | 9 | | namespace ValidateLib.TabularData.Parsing |
| | 10 | | { |
/// <summary>
/// Parses the raw string value of a cell into a typed value. Adheres to the "Parsing Cells" algorithm:
/// https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#parsing-cells.
/// </summary>
public class CellParser
{
    /// <summary>
    /// Datatype bases for which carriage returns, line feeds and tabs are left untouched (step 1).
    /// </summary>
    private static readonly HashSet<string> LineBreakPreservingBaseTypes = new HashSet<string>
    {
        "string",
        "json",
        "xml",
        "html",
        "anyAtomicType"
    };

    /// <summary>
    /// Datatype bases for which whitespace is neither collapsed nor trimmed (step 2).
    /// </summary>
    private static readonly HashSet<string> WhitespacePreservingBaseTypes = new HashSet<string>
    {
        "string",
        "json",
        "xml",
        "html",
        "anyAtomicType",
        "normalizedString"
    };

    /// <summary>
    /// Matches one or more consecutive whitespace characters; built once and reused on every call.
    /// </summary>
    private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Runs the whole parsing algorithm on a single cell. The outcome is written to the cell itself
    /// (its type, value or list of values, and errors).
    /// </summary>
    /// <param name="cell">Cell to parse. When it has no column, nothing is parsed.</param>
    /// <returns>The error list of <paramref name="cell"/> (the same list instance the cell holds).</returns>
    public static List<Error> ParseCell(Cell cell)
    {
        Column? cellsColumn = cell.column;
        if (cellsColumn is null)
        {
            return cell.errors;
        }

        // Steps 1 and 2: whitespace normalization driven by the datatype base.
        string normalizedStringValue = NormalizeString(cellsColumn.datatype!, cell.stringValue);

        // Step 3: an empty string is replaced by the column default.
        if (normalizedStringValue == "")
        {
            normalizedStringValue = cellsColumn._default;
        }

        if (cellsColumn.separator != null)
        {
            // Steps 4 and 5: the cell holds a list of values.
            HandleSeparatorCase(cell, normalizedStringValue);
        }
        else
        {
            // Steps 6 to 9: the cell holds a single value.
            HandleNormalCase(cell, normalizedStringValue);
        }

        return cell.errors;
    }

    /// <summary>
    /// Steps 1 and 2 of the algorithm: replaces line breaks and tabs, then collapses and trims
    /// whitespace, each only when the datatype base is not in the corresponding exclusion set.
    /// </summary>
    /// <param name="datatype">Datatype descriptor whose base decides which steps apply.</param>
    /// <param name="stringValue">Raw string value of the cell.</param>
    /// <returns>The normalized string.</returns>
    static string NormalizeString(DatatypeDescriptor datatype, string stringValue)
    {
        var baseType = datatype._base!._value!;

        if (!LineBreakPreservingBaseTypes.Contains(baseType))
        {
            stringValue = ReplaceCRLFT(stringValue);
        }

        if (!WhitespacePreservingBaseTypes.Contains(baseType))
        {
            stringValue = NormalizeWhitespace(stringValue);
        }

        return stringValue;
    }

    /// <summary>
    /// Step 1 of the algorithm: replaces every line feed, carriage return and tab with a space.
    /// </summary>
    /// <param name="stringValue">String to process.</param>
    /// <returns>The string with each of those characters replaced by a single space.</returns>
    static string ReplaceCRLFT(string stringValue)
    {
        return stringValue
            .Replace('\n', ' ')
            .Replace('\r', ' ')
            .Replace('\t', ' ');
    }

    /// <summary>
    /// Step 2 of the algorithm: collapses every run of whitespace into a single space and trims
    /// leading and trailing whitespace.
    /// </summary>
    /// <param name="input">String to process.</param>
    /// <returns>The collapsed and trimmed string.</returns>
    static string NormalizeWhitespace(string input)
    {
        return WhitespaceRun.Replace(input, " ").Trim();
    }

    /// <summary>
    /// Steps 4 and 5 of the algorithm: treats the cell as a list, splits the normalized string at the
    /// column separator and runs the remaining steps on every item.
    /// </summary>
    /// <param name="cell">Cell being parsed; its column must be set.</param>
    /// <param name="normalizedStringValue">String value after steps 1 to 3.</param>
    static void HandleSeparatorCase(Cell cell, string normalizedStringValue)
    {
        Column cellsColumn = cell.column!;
        cell.cellType = CellType.LIST;

        // Step 4: an empty string is an empty list, which is an error only in a required column.
        if (normalizedStringValue == "")
        {
            if (cellsColumn.required)
            {
                cell.errors.Add(ErrorFactory.GetEmptyCellInRequiredColumnVE(cell));
            }

            return;
        }

        // Step 5.1: the whole string is one of the null markers, so the value is null and
        // there is nothing to split or parse.
        if (cellsColumn._null.Contains(normalizedStringValue))
        {
            cell.cellType = CellType.NULL;
            return;
        }

        // Step 5.3: items are stripped unless the base is string or anyAtomicType.
        var baseType = cellsColumn.datatype!._base!._value;
        bool stripItems = baseType != "string" && baseType != "anyAtomicType";

        // Steps 5.2 and 5.4: every item is parsed, whatever the datatype base is.
        foreach (var rawItem in normalizedStringValue.Split(cellsColumn.separator))
        {
            // Trim() returns a new string; the result has to be used, not discarded.
            string item = stripItems ? rawItem.Trim() : rawItem;
            HandleNormalCase(cell, item, true);
        }
    }

    /// <summary>
    /// Handles steps 6-9 from the algorithm for one string: default substitution, null detection and
    /// conversion to the column datatype.
    /// </summary>
    /// <param name="cell">Cell being parsed; its column must be set.</param>
    /// <param name="normalizedString">String to convert.</param>
    /// <param name="addToList">
    /// Used to parametrize the method for reuse on list items: when true the resulting value is
    /// appended to the cell's list of values instead of being stored as the cell's single value,
    /// and the cell type is left unchanged.
    /// </param>
    static void HandleNormalCase(Cell cell, string normalizedString, bool addToList = false)
    {
        Column cellsColumn = cell.column!;

        if (normalizedString == "")
        {
            normalizedString = cellsColumn._default;
        }

        if (cellsColumn._null.Contains(normalizedString))
        {
            if (!addToList)
            {
                cell.cellType = CellType.NULL;
            }

            // A null value is an error only for required columns without a separator.
            if (cellsColumn.separator is null && cellsColumn.required)
            {
                cell.errors.Add(ErrorFactory.GetEmptyCellInRequiredColumnVE(cell));
            }

            return;
        }

        try
        {
            var datatypeDescriptor = cellsColumn.datatype!;
            var parsedCellValue = DatatypeFactory.GetDatatype(
                normalizedString,
                datatypeDescriptor._base!._value!,
                datatypeDescriptor.format
            );

            if (addToList)
            {
                cell.cellValues.Add(parsedCellValue);
            }
            else
            {
                cell.value = parsedCellValue;
            }
        }
        catch (DatatypeValidationError error)
        {
            // The string is not valid for the datatype: record the error and keep the value as a string.
            error.Cell = cell;
            cell.errors.Add(error);

            var fallbackValue = DatatypeFactory.GetDatatype(normalizedString, "string", null);
            if (addToList)
            {
                // For list items the fallback goes to the list, the same place a valid item would go.
                cell.cellValues.Add(fallbackValue);
            }
            else
            {
                cell.value = fallbackValue;
            }
        }
        catch (Error error)
        {
            cell.errors.Add(error);
        }
    }
}
| | 190 | |
|
| | 191 | |
|
| | 192 | | } |