Skip to content
This repository was archived by the owner on Feb 28, 2025. It is now read-only.

Commit bb72107

Browse files
committed
edit.c: Make line break detection much more conservative.
Detection now requires consistent line breaks to be present within the sampling buffer. If breaks of different types are found, or there is the slightest suspicion that the content may be binary, it reverts to the binary-safe "as-is" mode. Also, refactor the detection routines to facilitate unit testing. Signed-off-by: Paul Sokolovsky <pfalcon@users.sourceforge.net>
1 parent 2f0972b commit bb72107

File tree

1 file changed

+54
-10
lines changed

1 file changed

+54
-10
lines changed

src/editor/edit.c

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -387,10 +387,62 @@ check_file_access (WEdit * edit, const vfs_path_t * filename_vpath, struct stat
387387
*/
388388
/* --------------------------------------------------------------------------------------------- */
389389

390+
static LineBreaks
391+
detect_lb_type_buf (unsigned char *p, ssize_t sz)
392+
{
393+
LineBreaks detected_lb = LB_ASIS;
394+
395+
/* If there was error or file too short, give up */
396+
if (sz <= 2)
397+
return LB_ASIS;
398+
399+
p[(size_t) sz] = '\0';
400+
/* Avoid ambiguity of our buffer breaking CR LF sequence */
401+
if (p[sz - 1] == '\r') {
402+
p[--sz] = '\0';
403+
}
404+
405+
for (; sz--; p++) {
406+
LineBreaks new_lb = LB_ASIS;
407+
if (*p == '\r') {
408+
if (p[1] == '\n') {
409+
sz--; p++;
410+
new_lb = LB_WIN;
411+
} else {
412+
new_lb = LB_MAC;
413+
}
414+
} else if (*p == '\n') {
415+
/* LF CR is anomaly for text file, give up */
416+
if (p[1] == '\r')
417+
return LB_ASIS;
418+
new_lb = LB_UNIX;
419+
} else if (*p < 0x20 && *p != '\t' && *p != '\f') {
420+
/* The only common special char in text files is tab, much
421+
less commonly - form feed. Anything else - give up. */
422+
return LB_ASIS;
423+
}
424+
425+
/* If we detected a new lb, and it doesn't match previously
426+
detected, give up */
427+
if (new_lb != LB_ASIS) {
428+
if (detected_lb != LB_ASIS && detected_lb != new_lb) {
429+
return LB_ASIS;
430+
}
431+
detected_lb = new_lb;
432+
}
433+
}
434+
435+
/* LB_UNIX means that within buffer, we saw only LF breaks, but
436+
we cannot be sure about entire file. So, go conservative route
437+
and don't report to user in UI that this file has unix line
438+
breaks. */
439+
return detected_lb == LB_UNIX ? LB_ASIS : detected_lb;
440+
}
441+
390442
static LineBreaks
391443
detect_lb_type (const vfs_path_t *filename_vpath)
392444
{
393-
char buf[BUF_MEDIUM];
445+
unsigned char buf[BUF_LARGE];
394446
ssize_t file, sz;
395447

396448
file = mc_open (filename_vpath, O_RDONLY | O_BINARY);
@@ -400,15 +452,7 @@ detect_lb_type (const vfs_path_t *filename_vpath)
400452
sz = mc_read (file, buf, sizeof (buf) - 1);
401453
mc_close (file);
402454

403-
if (sz <= 0)
404-
return LB_ASIS;
405-
406-
buf[(size_t) sz] = '\0';
407-
if (strstr (buf, "\r\n") != NULL)
408-
return LB_WIN;
409-
if (strchr (buf, '\r') != NULL)
410-
return LB_MAC;
411-
return LB_ASIS;
455+
return detect_lb_type_buf (buf, sz);
412456
}
413457

414458
/* --------------------------------------------------------------------------------------------- */

0 commit comments

Comments
 (0)