Skip to content

Commit a27e486

Browse files
authored
Fix: Columnar handling of duplicate column values (#14)
2 parents 651dafd + 0d31af1 commit a27e486

File tree

2 files changed

+308
-118
lines changed

2 files changed

+308
-118
lines changed

src/Data/Csv/Columnar.cs

Lines changed: 162 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -7,131 +7,155 @@
77

88
namespace HashFields.Data.Csv
99
{
10+
/// <summary>
11+
/// Helper to work with delimited tabular data as columns rather than rows.
12+
/// </summary>
13+
/// <seealso cref="IEquatable{T}" />
1014
internal class Columnar : IEquatable<Columnar>
1115
{
12-
private readonly List<string> _headers = new();
16+
private readonly List<string> _header = new();
1317
private readonly Dictionary<string, List<string>> _data = new();
18+
private readonly string _delimiter;
1419

20+
/// <summary>
21+
/// The column of values by column name.
22+
/// </summary>
23+
/// <param name="key">The name of the column.</param>
24+
/// <returns>A list representing the column's values.</returns>
1525
public List<string> this[string key] { get => _data[key]; }
16-
public List<string> this[int index] { get => _data[_headers[index]]; }
17-
public List<string> Header { get => _headers.ToList(); }
18-
public List<List<string>> Columns { get => _data.Values.ToList(); }
1926

20-
public Columnar(string delimiter) : this(new MemoryStream(), delimiter)
21-
{
22-
}
27+
/// <summary>
28+
/// The column of values by column index.
29+
/// </summary>
30+
/// <param name="index">The 0-based index of the column.</param>
31+
/// <returns>A list representing the column's values.</returns>
32+
public List<string> this[int index] { get => _data[_header[index]]; }
2333

34+
/// <summary>
35+
/// The list of column names.
36+
/// </summary>
37+
public List<string> Header { get => _header.ToList(); }
38+
39+
/// <summary>
40+
/// The list of data columns.
41+
/// </summary>
42+
public List<List<string>> Columns { get => _data.Values.ToList(); }
43+
44+
/// <summary>
45+
/// Initialize a new <c>Columnar</c> for delimited data.
46+
/// </summary>
47+
/// <param name="stream">The <c>Stream</c> of data to read into this <c>Columnar</c>.</param>
48+
/// <param name="delimiter">The delimiter used between fields in the data.</param>
2449
public Columnar(Stream stream, string delimiter)
2550
{
2651
if (stream is not null)
2752
{
28-
var tuple = Parse(stream, delimiter);
53+
_delimiter = delimiter;
54+
55+
var tuple = Parse(stream, _delimiter);
2956

30-
_headers = tuple.Item1;
57+
_header = tuple.Item1;
3158
_data = tuple.Item2;
3259
}
3360
}
3461

62+
/// <summary>
63+
/// Call a function for each value in the specified columns.
64+
/// </summary>
65+
/// <param name="func">
66+
/// A function taking a string as input and returning a string.
67+
/// Each value in the column is passed through this function and
68+
/// overwritten in-place.
69+
/// </param>
70+
/// <param name="columns">The names of the columns to apply the function to.</param>
3571
public void Apply(Func<string, string> func, params string[] columns)
3672
{
37-
foreach (var column in _headers.Intersect(columns).ToArray())
73+
foreach (var column in _header.Intersect(columns).ToArray())
3874
{
3975
_data[column] = _data[column].ConvertAll(s => func(s));
4076
}
4177
}
4278

43-
public bool Equals(Columnar other)
44-
{
45-
if (other is null)
46-
{
47-
return false;
48-
}
49-
50-
if (!_headers.SequenceEqual(other._headers))
51-
{
52-
return false;
53-
}
54-
55-
foreach (var column in _data)
56-
{
57-
if (!column.Value.SequenceEqual(other._data[column.Key]))
58-
{
59-
return false;
60-
}
61-
}
62-
63-
return true;
64-
}
65-
66-
public override bool Equals(object obj)
67-
{
68-
if (obj is null)
69-
{
70-
return false;
71-
}
72-
73-
if (obj is not Columnar columnar)
74-
{
75-
return false;
76-
}
77-
78-
return Equals(columnar);
79-
}
80-
81-
public override int GetHashCode()
82-
{
83-
var hashcode = new HashCode();
84-
foreach (var header in _headers)
85-
{
86-
hashcode.Add(header);
87-
}
88-
foreach (var column in _data.Values)
89-
{
90-
foreach (var val in column)
91-
{
92-
hashcode.Add(val);
93-
}
94-
}
95-
return hashcode.ToHashCode();
96-
}
97-
79+
/// <summary>
80+
/// Remove the named columns from this <c>Columnar</c> data.
81+
/// The column names should match those found in the <c>Header</c>.
82+
/// </summary>
83+
/// <seealso cref="Header" />
84+
/// <param name="columns">The list of column names to remove.</param>
9885
public void Remove(params string[] columns)
9986
{
100-
foreach (var column in _headers.Intersect(columns).ToArray())
87+
// find intersection of the real header names and those for removal
88+
// create a new array from this intersection so we don't loop over
89+
// the collection we are modifying!
90+
foreach (var column in _header.Intersect(columns).ToArray())
10191
{
102-
_headers.Remove(column);
92+
_header.Remove(column);
10393
_data.Remove(column);
10494
}
10595
}
10696

107-
public void Write(Stream destination)
108-
{
109-
using var sw = new StreamWriter(destination);
110-
foreach (var row in Rows())
111-
{
112-
sw.WriteLine(String.Join(",", row));
113-
}
114-
}
115-
116-
private List<List<string>> Rows()
97+
/// <summary>
98+
/// Compute the list of data rows from the current state of this <c>Columnar</c>.
99+
/// </summary>
100+
public List<List<string>> Rows()
117101
{
102+
// find the column with the longest length (N) - the number of rows
103+
// create a list of N lists to represent the rows
118104
var rows = Enumerable.Range(0, Columns.Max(c => c.Count))
119105
.Select(_ => new List<string>())
120106
.ToList();
121107

122108
foreach (var column in Columns)
123109
{
124-
foreach (var val in column)
110+
// copy values for this column into each row
111+
for (int i = 0; i < column.Count; i++)
125112
{
126-
rows[column.IndexOf(val)].Add(val);
113+
// rows[i] is a list representing the ith row
114+
// append the column value to the end of the row list
115+
// the "next" position in the row
116+
rows[i].Add(column[i]);
127117
}
128118
}
129119

120+
// insert the header row first
130121
rows.Insert(0, Header);
131122

132123
return rows;
133124
}
134125

126+
/// <summary>
127+
/// Write this <c>Columnar</c> data to a stream as delimited tabular data.
128+
/// </summary>
129+
/// <param name="destination">A writable <c>Stream</c> target for this <c>Columnar</c>.</param>
130+
public void Write(Stream destination)
131+
{
132+
using var sw = new StreamWriter(destination);
133+
foreach (var row in Rows())
134+
{
135+
sw.WriteLine(String.Join(_delimiter, row));
136+
}
137+
}
138+
139+
/// <summary>
140+
/// Read delimited data from a stream and convert into columnar format.
141+
/// </summary>
142+
/// <param name="stream">The source of data.</param>
143+
/// <param name="delimiter">The delimiter used to separate fields in the data.</param>
144+
/// <returns>A <c>Tuple</c> containing two items:
145+
/// <list type="bullet">
146+
/// <item>
147+
/// <term><c>List{String}</c></term>
148+
/// <description>The ordered header row of column names.</description>
149+
/// </item>
150+
/// <item>
151+
/// <term><c>Dictionary{String,List{String}}</c></term>
152+
/// <description>
153+
/// The data columns, where the key is the column name
154+
/// and the value is the list of values in the column.
155+
/// </description>
156+
/// </item>
157+
/// </list>
158+
/// </returns>
135159
private static Tuple<List<string>, Dictionary<string, List<string>>> Parse(Stream stream, string delimiter)
136160
{
137161
var header = new List<string>();
@@ -170,5 +194,64 @@ private static Tuple<List<string>, Dictionary<string, List<string>>> Parse(Strea
170194
)
171195
);
172196
}
197+
198+
#region IEquatable<Columnar>
199+
200+
public bool Equals(Columnar other)
201+
{
202+
if (other is null)
203+
{
204+
return false;
205+
}
206+
207+
if (!_header.SequenceEqual(other._header))
208+
{
209+
return false;
210+
}
211+
212+
foreach (var column in _data)
213+
{
214+
if (!column.Value.SequenceEqual(other._data[column.Key]))
215+
{
216+
return false;
217+
}
218+
}
219+
220+
return true;
221+
}
222+
223+
public override bool Equals(object obj)
224+
{
225+
if (obj is null)
226+
{
227+
return false;
228+
}
229+
230+
if (obj is not Columnar columnar)
231+
{
232+
return false;
233+
}
234+
235+
return Equals(columnar);
236+
}
237+
238+
public override int GetHashCode()
239+
{
240+
var hashcode = new HashCode();
241+
foreach (var header in _header)
242+
{
243+
hashcode.Add(header);
244+
}
245+
foreach (var column in _data.Values)
246+
{
247+
foreach (var val in column)
248+
{
249+
hashcode.Add(val);
250+
}
251+
}
252+
return hashcode.ToHashCode();
253+
}
254+
255+
#endregion
173256
}
174257
}

0 commit comments

Comments
 (0)