| | 1 | | using Newtonsoft.Json.Linq; |
| | 2 | | using ValidateLib.ErrorsAndWarnings.Warnings; |
| | 3 | | using ValidateLib.Metadata.Descriptors; |
| | 4 | | using ValidateLib.Metadata.ParsingAndValidation; |
| | 5 | | using ValidateLib.TabularData.Parsing; |
| | 6 | |
|
| | 7 | | namespace ValidateLib.Metadata.Embedded |
| | 8 | | { |
| | 9 | | /// <summary> |
| | 10 | | /// Extracts embedded metadata based on the |
| | 11 | | /// <see href="https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#parsing">algorithm</see> |
| | 12 | | /// </summary> |
| | 13 | | public class EmbeddedMetadataExtractor : Reader |
| | 14 | | { |
/// <summary> Source row counter; starts at 1 and is advanced as rows are consumed. </summary>
protected int _sourceRowNumber = 1;

/// <summary> Reader over the input file stream; created when extraction starts. </summary>
protected CustomStreamReader? _csr;

/// <summary> Reads raw rows from <see cref="_csr"/>; created when extraction starts. </summary>
protected RowReader? _rowReader;

/// <summary> Splits raw row content into cell values; created when extraction starts. </summary>
protected RowParser? _rowParser;

/// <summary>
/// Creates the extractor and forwards the parsing flags to the base reader.
/// </summary>
/// <param name="flags"> flags passed unchanged to the base class </param>
public EmbeddedMetadataExtractor(Flags flags)
    : base(flags)
{
}
| | 23 | | /// <summary> |
| | 24 | | /// extracts embedded metadata from file stream. |
| | 25 | | /// </summary> |
| | 26 | | /// <param name="fileIRI"> normalized file path, used as URL for embedded metadata</param> |
| | 27 | | /// <param name="dialect"> dialect to be used for extraction </param> |
| | 28 | | /// <param name="fs"> file stream to extract the metadata from</param> |
| | 29 | | /// <returns></returns> |
| | 30 | | public JObject extractEmbeddedMetadata(FileStream fs, DialectDescriptor dialect, string fileIRI) |
| | 31 | | { |
| | 32 | | // point 2 in https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/#parsing |
| | 33 | | // creating default embedded metadata object |
| 1 | 34 | | string content = "{\r\n \"@context\": \"http://www.w3.org/ns/csvw\",\r\n \"rdfs:comment\": [],\r\n \"tabl |
| 1 | 35 | | JObject embeddedMetadata = JObject.Parse(content); |
| 1 | 36 | | embeddedMetadata["url"] = fileIRI; // required property, needed for table description compatibility check |
| | 37 | |
|
| | 38 | | try |
| | 39 | | { |
| 1 | 40 | | if (fs is not null) |
| | 41 | | { |
| | 42 | |
|
| 1 | 43 | | _csr = new CustomStreamReader(fs, bufferSize, maximalLineTerminatorLength); |
| 1 | 44 | | _rowReader = new RowReader(flags); |
| 1 | 45 | | _rowParser = new RowParser(flags); |
| 1 | 46 | | _sourceRowNumber = 1; // point 4 |
| 1 | 47 | | HandleEncoding(); // point 5 |
| 1 | 48 | | HandleSkipRows(embeddedMetadata); |
| 1 | 49 | | HandleHeader(embeddedMetadata); |
| | 50 | |
|
| | 51 | | } |
| 1 | 52 | | } |
| | 53 | | finally |
| | 54 | | { |
| 1 | 55 | | if (fs is not null) |
| 1 | 56 | | fs.Dispose(); |
| 1 | 57 | | } |
| | 58 | |
|
| | 59 | |
|
| 1 | 60 | | return embeddedMetadata; |
| | 61 | | } |
| | 62 | |
|
| | 63 | | /// <summary> |
| | 64 | | /// Extracts embedded table descriptor from the file. |
| | 65 | | /// </summary> |
| | 66 | | /// <param name="fs"> filestream to the file from which we extract embedded metadata </param> |
| | 67 | | /// <param name="dialect"> dialect based on what we parse tabular data file </param> |
| | 68 | | /// <param name="fileIRI"> file iri</param> |
| | 69 | | /// <returns></returns> |
| | 70 | | public TableDescriptor extractEmbeddedMetadataTableDescriptor(FileStream fs, DialectDescriptor dialect, string f |
| | 71 | | { |
| | 72 | |
|
| 1 | 73 | | JObject embeddedMetadata = extractEmbeddedMetadata(fs, dialect, fileIRI); |
| 1 | 74 | | List<Warning> warnings = new List<Warning>(); |
| | 75 | |
|
| 1 | 76 | | var tableDescriptor = MetadataParserValidator.ProcessTableFromJObject(warnings, embeddedMetadata, fileIRI); |
| 1 | 77 | | return tableDescriptor; |
| | 78 | |
|
| | 79 | | } |
| | 80 | |
|
/// <summary>
/// Hook invoked as point 5 of the parsing algorithm.
/// Not implemented yet: the method currently does nothing.
/// </summary>
protected void HandleEncoding()
{
    // TODO
}
| | 85 | |
|
/// <summary>
/// When the header row count is zero, adds one empty column description to
/// <c>tableSchema.columns</c> for every cell of the first record row that remains
/// after the leading skipped columns have been dropped. Does nothing otherwise.
/// </summary>
/// <param name="embeddedMetadata"> embedded metadata object whose columns array is extended </param>
/// <param name="firstRecordRowContent"> raw content of the first record row </param>
protected void HandleZeroHeaderRowCount(JObject embeddedMetadata, string firstRecordRowContent)
{
    if (flags.headerRowCount != 0)
    {
        return;
    }

    JArray columnsArray = (JArray)embeddedMetadata["tableSchema"]!["columns"]!;
    List<string> firstRecordRowColumns = _rowParser!.ParseRow(firstRecordRowContent);

    // Drop the skipped columns exactly once. The count is clamped to the row length so a
    // row with fewer cells than skipColumns does not make RemoveRange throw.
    int columnsToSkip = Math.Min(flags.skipColumns, firstRecordRowColumns.Count);
    firstRecordRowColumns.RemoveRange(0, columnsToSkip);

    // The list now holds only the remaining cells, so its Count is the number of columns
    // to create; subtracting skipColumns again here would create too few columns.
    for (int i = 0; i < firstRecordRowColumns.Count; i++)
    {
        AddNewColumn(columnsArray);
    }
}
// Point 6 in algorithm
/// <summary>
/// Reads <c>flags.skipRows</c> rows from the input and records them in the
/// <c>rdfs:comment</c> array of the embedded metadata: rows starting with the comment
/// prefix are stored without the prefix, other non-empty rows are stored unchanged and
/// empty rows are ignored. The source row number is advanced once per row read.
/// </summary>
/// <param name="embeddedMetadata"> embedded metadata object whose comments array is extended </param>
protected void HandleSkipRows(JObject embeddedMetadata)
{
    JArray comments = (JArray)embeddedMetadata["rdfs:comment"]!;

    for (int i = 0; i < flags.skipRows; i++)
    {
        string? rowContent = _rowReader!.ReadRow(_csr!);
        if (rowContent is null)
        {
            // Guard against a missing row (e.g. input shorter than skipRows) instead of
            // dereferencing null below. TODO confirm ReadRow's end-of-input contract.
            break;
        }

        if (IsComment(rowContent))
        {
            // IsComment returned true, so the comment prefix is known to be non-null here.
            comments.Add(rowContent.Substring(flags.commentPrefix!.Length));
        }
        else if (rowContent != "")
        {
            comments.Add(rowContent);
        }

        _sourceRowNumber++;
    }
}
| | 122 | |
|
// Point 7 in algorithm
/// <summary>
/// Reads <c>flags.headerRowCount</c> header rows. A row starting with the comment prefix
/// is stored (without the prefix) in the <c>rdfs:comment</c> array; any other row is parsed
/// into cell values that become column titles. The source row number is advanced once for
/// every header row read.
/// </summary>
/// <param name="embeddedMetadata"> embedded metadata object that receives comments and column titles </param>
protected void HandleHeader(JObject embeddedMetadata)
{
    JArray comments = (JArray)embeddedMetadata["rdfs:comment"]!;

    for (int i = 0; i < flags.headerRowCount; i++)
    {
        string? rowContent = _rowReader!.ReadRow(_csr!);
        if (rowContent is null)
        {
            // Guard against a missing row (e.g. input shorter than the header) instead of
            // dereferencing null below. TODO confirm ReadRow's end-of-input contract.
            break;
        }

        // point 7.2
        if (IsComment(rowContent))
        {
            // IsComment returned true, so the comment prefix is known to be non-null here.
            comments.Add(rowContent.Substring(flags.commentPrefix!.Length));
        }
        // point 7.3
        else
        {
            RowParser rowParser = new RowParser(flags);
            List<string> cellValues = rowParser.ParseRow(rowContent);
            HandleColumnsFromOneRowOfHeader(cellValues, embeddedMetadata);
        }

        // point 7.4: the increment belongs inside the loop so the counter moves once per
        // header row, and not at all when headerRowCount is 0.
        _sourceRowNumber++;
    }
}
/// <summary>
/// Checks whether a row is a comment row, i.e. a comment prefix is configured and the
/// row starts with it.
/// </summary>
/// <param name="rowContent"> content of the row.</param>
/// <returns> true if a comment prefix is set and the row starts with it </returns>
protected bool IsComment(string rowContent)
{
    // Ordinal comparison: the prefix is a structural marker, not linguistic text, so the
    // result must not depend on the current culture's collation rules.
    return flags.commentPrefix is not null
        && rowContent.StartsWith(flags.commentPrefix, StringComparison.Ordinal);
}
| | 157 | |
|
/// <summary>
/// Turns the cells of one header row into column titles: after the skipped columns are
/// removed, the cell at position i either creates a new column description (when no
/// column exists at that position yet) or adds another title to the existing one.
/// </summary>
/// <param name="cellValues"> cells of the header row; the skipped columns are removed from this list </param>
/// <param name="embeddedMetadata"> embedded metadata object whose columns array is updated </param>
protected void HandleColumnsFromOneRowOfHeader(List<string> cellValues, JObject embeddedMetadata)
{
    RemoveSkipColumns(cellValues);
    JArray columns = (JArray)embeddedMetadata["tableSchema"]!["columns"]!;

    // Empty and all-whitespace header names are deliberately kept, so no cell is skipped.
    int position = 0;
    foreach (string title in cellValues)
    {
        bool columnAlreadyExists = position < columns.Count;
        if (columnAlreadyExists)
        {
            AppendNewTitleToColumnDescriptor(columns, title, position);
        }
        else
        {
            AddNewColumnWithTitle(columns, title);
        }

        position++;
    }
}
/// <summary>
/// Removes the leading <c>flags.skipColumns</c> cell values from the row, as they
/// do not concern us. If the row has fewer cells than that, all of its cells are removed.
/// </summary>
/// <param name="cellValues"> cells of a row; modified in place </param>
protected void RemoveSkipColumns(List<string> cellValues)
{
    // Clamp to the row length: RemoveRange throws when asked to remove more items than exist.
    int columnsToSkip = Math.Min(flags.skipColumns, cellValues.Count);
    cellValues.RemoveRange(0, columnsToSkip);
}
/// <summary>
/// Appends a new, empty column description object to the columns array.
/// </summary>
/// <param name="columnsArray"> array of column descriptions to extend </param>
protected void AddNewColumn(JArray columnsArray)
{
    JObject emptyColumn = new JObject();
    columnsArray.Add(emptyColumn);
}
| 1 | 187 | | protected void AddNewColumnWithTitle(JArray columnsArray, string title) => columnsArray.Add(new JObject(new JPro |
| 0 | 188 | | protected void AppendNewTitleToColumnDescriptor(JArray columnsArray, string title, int index) => ((JArray)column |
| | 189 | | } |
| | 190 | | } |