Replies: 4 comments
-
The dialect sniffing procedure:

```python
def sniff(self) -> Union[Dialect, None]:
    self.validate()
    # Initialize the candidate dialects and their scores
    dialects = p_dialects.get_dialects(p_dialects(self.delimiter_list, self.quotechar_list))
    n = len(dialects)
    scores = [0] * n
    # Compute a table score for each candidate dialect
    j = 0
    for d in dialects:
        t_scoring = t_score(csv_path=self.file_path,
                            dialect=d,
                            threshold=self.threshold,
                            encoding=self.encoding)
        try:
            scores[j] = t_scoring.compute()
        except Exception:
            scores[j] = 0
        j += 1
    return get_best_dialect(scores, dialects)

def get_best_dialect(
        scores_: List[float],
        dialects_: List[Dialect]
) -> Union[Dialect, None]:
    # The winner is the dialect with the highest strictly positive score
    max_score = max(scores_)
    if max_score > 0:
        return dialects_[scores_.index(max_score)]
    else:
        return None
```
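For orientation, a hypothetical usage sketch of the procedure above. The `CSVsniffer` class name, its constructor signature, and the argument values are assumptions; only the attribute names (`file_path`, `delimiter_list`, `quotechar_list`, `threshold`, `encoding`) come from `sniff()` itself:

```python
# Hypothetical usage -- class name and constructor are assumptions
# inferred from the attributes referenced in sniff() above.
sniffer = CSVsniffer(
    file_path="data.csv",
    delimiter_list=[",", ";", "\t", "|"],
    quotechar_list=['"', "'"],
    threshold=10,
    encoding="utf-8",
)
dialect = sniffer.sniff()  # best-scoring Dialect, or None if all scores are 0
```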
-
The table score procedure:

```python
def compute(self) -> float:
    self.validate()
    # Initialize the table object
    table_obj = table_constructor(file_path=self.csv_path,
                                  threshold=self.threshold,
                                  encoding=self.encoding)
    # Construct a sample table from the CSV. Lines starting with '#' are skipped.
    sample = table_obj.fromCSV(_dialect=self.dialect)
    # Compute the record score by inferring each field's data type
    detector = type_detector()
    record_score = detector.record_score(data=sample, dialect=self.dialect)
    # Compute the table-uniformity parameters
    _uniformity = t_uniformity(table=sample)
    tau = _uniformity.compute()
    n = len(sample)
    if n > 1:
        gamma = tau[0] / self.threshold + (1 / (tau[1] + n))
    else:
        # Single-record sample: derive gamma from the record score alone
        eta = math.sqrt(record_score) / 10
        k = len(sample[0])
        gamma = (eta + (1 / k)) / (k - math.floor(eta * k) + 1)
    return gamma * record_score
```
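To make the multi-record branch (`n > 1`) concrete, here is a tiny worked example of the `gamma` computation. All the numbers are illustrative assumptions, not output of the real scorer:

```python
tau = [0.9, 0.0]     # assumed: high consistency (tau_0), no dispersion (tau_1)
threshold = 10       # assumed sampling threshold
n = 25               # records in the sample
record_score = 24.0  # assumed output of record_score()

gamma = tau[0] / threshold + (1 / (tau[1] + n))
print(gamma)                 # 0.09 + 0.04 = 0.13
print(gamma * record_score)  # 3.12 -> the table score for this dialect
```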
-
The table uniformity procedure:

```python
def avg_fields(self) -> float:
    # Average number of fields per record
    nk = 0
    for record in self.table:
        nk += len(record)
    return nk / len(self.table)

def compute(self) -> List[float]:
    self.validate()
    phi = self.avg_fields()
    mu, c, sm, alpha, beta = 0, 0, 0, 0, 0
    n = len(self.table)  # Number of records
    for i in range(0, n):
        k_i = len(self.table[i])  # Number of fields in the current record
        mu += math.pow(k_i - phi, 2)  # Squared deviation from the mean
        if i == 0:
            c += 1
            k_max = k_i
            k_min = k_i
            if n == 1:
                sm = c
        else:
            if k_i > k_max:
                k_max = k_i
            if k_i < k_min:
                k_min = k_i
            k_prev = len(self.table[i - 1])
            if k_prev != k_i:
                alpha += 1  # Field count changed between consecutive records
                if c > sm:
                    sm = c  # Segmented mode: longest run of equal field counts
                c = 0
            else:
                c += 1
        if i == n - 1:
            if c > sm:
                sm = c
    if n > 1:
        sigma = math.sqrt(mu / (n - 1))  # Standard deviation of field counts
    else:
        sigma = math.sqrt(mu / n)
    tau_0 = 1 / (1 + 2 * sigma)  # Consistency factor
    r = k_max - k_min  # Range of field counts
    if alpha > 0:
        beta = sm / n  # Records variability factor
    tau_1 = 2 * r * (math.pow(alpha, 2) + 1) * ((1 - beta) / sm)  # Records dispersion
    return [tau_0, tau_1]
```
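Written out as formulas (my transcription of the code above, for the $n > 1$ case):

$$
\sigma = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}\left(k_i - \phi\right)^2},\qquad
\tau_0 = \frac{1}{1 + 2\sigma},\qquad
\tau_1 = \frac{2r\,(\alpha^2 + 1)\,(1 - \beta)}{s_m}
$$

where $\phi$ is the mean field count, $r = k_{max} - k_{min}$, $\alpha$ is the number of field-count changes between consecutive records, $s_m$ is the segmented mode (the longest run of records sharing a field count), and $\beta = s_m / n$ when $\alpha > 0$ (otherwise $\beta = 0$).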
-
The record score procedure:

```python
def record_score(self,
                 data: List[List],
                 dialect: Dialect
                 ) -> float:
    """
    Computes the record score from the detected data type of each cell.

    Parameters
    ----------
    data: List[List[str]]
        the data as a list of string lists
    dialect: Dialect
        the dialect used to read the file

    Returns
    -------
    record_score: float
    """
    try:
        td = type_detector()
        total_score = 0
        for record in data:
            tmp_sum = 0
            k = 0
            for field in record:
                k += 1
                # A field with a recognized data type scores 100,
                # an unrecognized one only 0.1
                if td.is_known_type(trip_quotes(field, dialect)):
                    tmp_sum += 100
                else:
                    tmp_sum += 0.1
            total_score += math.pow(tmp_sum, 2) / (100 * math.pow(k, 2))
        return total_score
    except Exception:
        return 0.0

def trip_quotes(
        field: str,
        dialect: Dialect
) -> str:
    if field != '':
        # Remove surrounding whitespace before stripping quote characters
        field = field.strip()
        return trip_ends(field, dialect) if field != '' else field
    else:
        return field

def trip_ends(
        field: str,
        dialect: Dialect
) -> str:
    ...  # body not shown in this comment
```
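As a quick arithmetic check of the per-record term `tmp_sum**2 / (100 * k**2)`: a record whose `k` fields all have a known type contributes exactly `(100k)**2 / (100k**2) = 100`, and unrecognized fields pull that down quadratically. A small standalone sketch (the helper below is illustrative, not part of the implementation):

```python
import math

def record_contribution(known: int, k: int) -> float:
    """Illustrative helper: known fields add 100, unknown fields add 0.1."""
    tmp_sum = 100 * known + 0.1 * (k - known)
    return math.pow(tmp_sum, 2) / (100 * math.pow(k, 2))

print(record_contribution(3, 3))  # 100.0   -> every field has a known type
print(record_contribution(2, 3))  # ~44.49  -> one unrecognized field
print(record_contribution(0, 3))  # 0.0001  -> nothing recognized
```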
-
Problem overview
Currently, this project does not have a stable way of detecting a CSV file's configuration (its dialect). An example of this is raised in #1719, where the utility fails to detect the configuration of the given files.
Details
At the moment, @jqnatividad has begun digging into the problem and has pointed out a path forward, outlined in jqnatividad/qsv-sniffer#14. Currently, all of its tasks are under study but not completed.
New path
In this discussion I will present a new approach to implementing dialect detection in qsv using simple elements. With this approach, dialect detection is as reliable as CleverCSV's, and results can be obtained with greater certainty. The process is described procedure by procedure in the accompanying comments: dialect sniffing, table scoring, table uniformity, and record scoring.
A Python implementation of this exact approach is described in a GitHub repository. The evaluation of this method compares three detectors:

- CSVsniffer
- CleverCSV
- csv.Sniffer

This sheds light on one point: the presented approach clearly outperforms csv.Sniffer, and also CleverCSV, on the research datasets. Hoping this can help this wonderful project!