From 0beaf03fe299c03445b23c2e2f52907305ce42ec Mon Sep 17 00:00:00 2001
From: unknown <71151164+ZERICO2005@users.noreply.github.com>
Date: Tue, 8 Jul 2025 00:00:02 -0600
Subject: [PATCH] Implemented dtof in assembly

---
 src/crt/dtof.src                              | 242 ++++++++++++++++--
 .../float64_to_float32/src/f64_to_f32_LUT.h   |  14 +-
 .../float64_to_float32/src/main.c             | 161 ++++++++++++
 3 files changed, 397 insertions(+), 20 deletions(-)

diff --git a/src/crt/dtof.src b/src/crt/dtof.src
index 1dac34205..53e663032 100644
--- a/src/crt/dtof.src
+++ b/src/crt/dtof.src
@@ -1,18 +1,224 @@
-	assume	adl=1
-
-	section	.text
-
-	public	__dtof
-
-__dtof:
-	; f64_ret_f32
-	push	af, iy, bc, de, hl
-	call	___f64_to_f32
-	pop	af
-	ld	a, e
-	pop	de
-	ld	e, a
-	pop	bc, iy, af
-	ret
-
-	extern	___f64_to_f32
+	assume	adl=1
+
+	section	.text
+
+	public	__dtof
+
+	private	__dtof_helper
+__dtof_helper:
+	; Moving this block of code to be behind __dtof ensures that
+	; __dtof.ret_copysign can always be reached by jr in all paths.
+.overflow:
+	; carry is set here
+	pop	hl
+	; A = $10
+	add	a, c	; attempts to overflow the low 4 bits of the exponent
+	rl	b	; (0x7F << 1) | 1 if the input is inf/NaN
+	inc	b	; B will only be zero if the input was inf/NaN
+	jr	nz, .not_inf_nan
+
+	; carry is cleared
+	adc	hl, hl
+	jr	nz, .has_payload	
+	ld	a, e
+	rla
+	and	a, $3F
+	jr	z, .no_payload
+.has_payload:
+	set	5, e	; ensure that NaN stays NaN
+.no_payload:
+	ld	a, c
+	push	de
+	pop	bc
+	ld	l, 5
+	call	__lshru
+	push	bc
+	pop	hl
+.finish_inf_nan:
+	ld	a, $7F
+	jr	__dtof.ret_copysign
+.not_inf_nan:
+	; return infinity
+	ld	hl, $800000
+	jr	.finish_inf_nan
+
+; Convert BC:UDE:UHL F64 to E:UHL F32
+; Rounding: round to nearest with ties to even
+; Behaviour:
+;  Underflow: Returns signed zero. No signals raised.
+;  Subnormal: No signals raised.
+;  Rounded to Infinity: No signals raised.
+;  Overflow: Returns signed infinity. No signals raised.
+;  Signaling NaN: Quiet bit preserved. No signals raised.
+;  Quiet NaN: Quiet bit preserved. No signals raised.
+;  NaN Payloads: Copies the most significant payload bits. The LSB of mantissa is set if payload bits were discarded/truncated out.
+__dtof:
+	bit	7, b
+	push	af	; preserve A and signbit
+	push	bc
+	push	de
+	push	hl
+	; clear UBC
+	inc	bc
+	dec.s	bc
+	res	7, b
+	ld	hl, -$3810
+	add	hl, bc
+	jr	nc, .maybe_subnormal
+	ld	hl, -$47F0	; $FFB810
+	ld	a, l		; ld a, $10
+	add	hl, bc
+	jr	c, __dtof_helper.overflow
+	; result is normal or rounds to infinity
+	; calculate new exponent
+	; we only need the low 8 bits of the exponent
+	add	hl, hl
+	add	hl, hl
+	add	hl, hl
+	add	hl, hl
+	; offset = -$380 - -$47F = $FF = -1 ; therefore decrement
+	dec	h		; store new exponent
+	ld	l, 29		; f64_mant_bits - f32_mant_bits = 52 - 23 = 29
+	ex	(sp), hl	; (SP) = exponent/shift, HL = lo24
+
+	; clear exponent
+	dec	a	; ld	a, $0F
+	and	a, c
+	ld	c, a
+	xor	a, a
+	ld	b, a
+	; test round bit
+	bit	4, e
+	jr	z, .round_down
+	; test guard bit
+	ld	a, e
+	and	a, $20
+	jr	nz, .round_up
+	; test sticky bits
+	inc	a	; make A non-zero
+	adc	hl, hl
+	jr	nz, .round_up
+	ld	a, e
+	rla
+	and	a, $1F
+.round_up:
+.round_down:
+	call	__llshru
+	or	a, a
+	jr	z, .no_round
+	inc	hl	; does not overflow
+.no_round:
+	pop	af	; a = exponent, flags = 29 = ---5H3V-C
+	or	a, a
+	rra
+	jr	nc, .even_exponent
+	ld	bc, $800000
+	add	hl, bc	; the result might be rounded to infinity here
+	adc	a, c	; adc a, 0 ; wont overflow
+.even_exponent:
+.subnormal_no_round:
+.ret_copysign:
+	pop	de
+	ld	e, a
+	pop	bc
+	pop	af
+	ret	z
+	set	7, e
+	ret
+
+.ret_zero:
+	; carry is cleared
+	pop	hl
+	xor	a, a
+	sbc	hl, hl
+	jr	.ret_copysign
+
+.maybe_subnormal:
+	ld	hl, -$3690
+	add	hl, bc
+	jr	nc, .ret_zero
+	; calculate shift
+	; A = (uint8_t)((BC - $3690) >> 4)
+	; A = (uint8_t)((HL << 4) >> 8)
+	add	hl, hl
+	add	hl, hl
+	add	hl, hl
+	add	hl, hl
+	ld	a, h
+	; Shift = -A + 4 + 24
+	cpl
+	add	a, (4 + 24) + 1	; (4 + 24) + CPL trick
+	; maximum shift = 24 + 4 + 25 = 24 + 29 = 53
+	; minimum shift = 24 + 4 + 1 = 24 + 5 = 29
+	ld	b, a
+	ld	e, a	; store shift amount
+	xor	a, a
+	; calculate sticky bits
+	ld	hl, 1
+.shift_loop:
+	add	hl, hl
+	rla
+	djnz	.shift_loop
+	; carry won't be set
+	; set C:UDE to A:UHL
+	; shift by an additional 24 bits
+	dec	hl
+	or	a, a
+	jr	z, .the_set_bit_is_in_hl
+	dec	a
+.the_set_bit_is_in_hl:
+	ld	c, a
+	ld	a, e	; restore shift amount
+	ex	de, hl
+	scf
+	sbc	hl, hl
+	; BC:UDE:UHL = 1 << shift
+	; (SP) = X
+	call	__lland
+	; test if BC:UDE:UHL is zero
+	; UBC must be zero for this to work
+	add	hl, de	; carry may be set
+	adc	hl, bc	; wont overflow
+	pop	hl
+	; DE and BC are swapped here
+	pop	bc
+	pop	de
+	push	de
+	push	bc
+
+	; clear exponent and include the implicit mantissa bit
+	ld	d, 0
+	jr	z, .no_sticky_bits
+	inc	d
+.no_sticky_bits:
+
+	ld	l, a	; L = shift
+	ld	a, e
+	and	a, $0F
+	or	a, $10
+
+	call	__lshru
+	xor	a, a	; subnormal exponent
+	; HL = BC >> 1
+	scf
+	sbc	hl, hl	; ld hl, -1
+	add	hl, sp
+	push	bc
+	srl	(hl)
+	pop	hl
+	rr	h
+	rr	l	; round bit shifted out
+
+	jr	nc, .subnormal_no_round
+	dec	d
+	jr	z, .subnormal_round_up
+	bit	0, l
+	jr	z, .subnormal_no_round
+.subnormal_round_up:
+	inc	hl	; wont overflow, but may become FLT_MIN
+; .subnormal_no_round:
+	jr	.ret_copysign
+
+	extern	__lland
+	extern	__llshru
+	extern	__lshru
diff --git a/test/floating_point/float64_to_float32/src/f64_to_f32_LUT.h b/test/floating_point/float64_to_float32/src/f64_to_f32_LUT.h
index 904062339..5935844cd 100644
--- a/test/floating_point/float64_to_float32/src/f64_to_f32_LUT.h
+++ b/test/floating_point/float64_to_float32/src/f64_to_f32_LUT.h
@@ -9,7 +9,7 @@ typedef uint64_t input_type;
 
 typedef uint32_t output_type;
 
-static const input_type f64_to_f32_LUT_input[256] = {
+static const input_type f64_to_f32_LUT_input[260] = {
 /*    0 */ UINT64_C(0x0000000000000000),
 /*    1 */ UINT64_C(0x0000000000000001),
 /*    2 */ UINT64_C(0x0010000000000000),
@@ -266,9 +266,14 @@ static const input_type f64_to_f32_LUT_input[256] = {
 /*  253 */ UINT64_C(0xD22D38D57ABF3991),
 /*  254 */ UINT64_C(0xA86498F2933913FB),
 /*  255 */ UINT64_C(0x4841C1F00831E908),
+/* bonus edge cases */
+/*  256 */ UINT64_C(0x369F82B925D1BFBA),
+/*  257 */ UINT64_C(0xB76634D97D4F585C),
+/*  258 */ UINT64_C(0x36DD000000000000),
+/*  259 */ UINT64_C(0xB80E0000A0000000),
 };
 
-const output_type f64_to_f32_LUT_output[256] = {
+const output_type f64_to_f32_LUT_output[260] = {
 /*    0 */ UINT32_C(0x00000000),
 /*    1 */ UINT32_C(0x00000000),
 /*    2 */ UINT32_C(0x00000000),
@@ -525,6 +530,11 @@ const output_type f64_to_f32_LUT_output[256] = {
 /*  253 */ UINT32_C(0xFF800000),
 /*  254 */ UINT32_C(0x80000000),
 /*  255 */ UINT32_C(0x7F800000),
+/* bonus edge cases */
+/*  256 */ UINT32_C(0x00000001),
+/*  257 */ UINT32_C(0x80001635),
+/*  258 */ UINT32_C(0x0000000E),
+/*  259 */ UINT32_C(0x80780002),
 };
 
 #endif /* F64_TO_F32_LUT_H */
diff --git a/test/floating_point/float64_to_float32/src/main.c b/test/floating_point/float64_to_float32/src/main.c
index 4cf893fd2..085c7a609 100644
--- a/test/floating_point/float64_to_float32/src/main.c
+++ b/test/floating_point/float64_to_float32/src/main.c
@@ -18,6 +18,165 @@ typedef union F32_pun {
     uint32_t bin;
 } F32_pun;
 
+typedef union F64_pun {
+    long double flt;
+    uint64_t bin;
+} F64_pun;
+
+/* debugging code */
+#if 0
+
+void print_failed(long double input, uint32_t guess, uint32_t truth) {
+    F64_pun value;
+    value.flt = input;
+    printf(
+        "I: %016llX -->\nG: %08lX != %08lX\n",
+        value.bin, guess, truth
+    );
+}
+
+typedef union Float32_Bitwise {
+    float flt_part;
+    uint32_t u32_part;
+} Float32_Bitwise;
+
+typedef union Float64_Bitwise {
+    long double flt_part;
+    uint64_t u64_part;
+} Float64_Bitwise;
+
+uint32_t _dtof_c(uint64_t x) {
+    uint64_t mant = x & ((UINT64_C(1) << 52) - 1);
+    const uint16_t biased = (x >> 52) & 0x7FF;
+    const uint16_t BC = (uint16_t)((x >> 48) & 0x7FFF);
+    // 879 == 0x369
+    if (BC < 16 * (1023 - (126 + 24))) {
+        return 0;
+    }
+    // 897 == 0x381
+    if (BC < 16 * (1023 - 126)) {
+        uint64_t norm = mant | (UINT64_C(1) << 52);
+        uint8_t sub_expon = -(uint8_t)((BC - 16 * (1023 - 126)) >> 4);
+        const bool sticky = norm & ((UINT64_C(1) << (sub_expon + (4 + 24))) - 1);
+        norm >>= sub_expon + (4 + 24);
+        const bool round = norm & 1;
+        const bool guard = norm & 2;
+        norm >>= 1;
+
+        uint32_t ret = (uint32_t)norm;
+        if (round && (guard || sticky)) {
+            // round up to even
+            ret++;
+        }
+        return ret;
+    }
+    // 1151 == 0x47F
+    if (biased >= 128 + 1023) {
+        #if 1
+        if (biased == 0x7FF) {
+            uint32_t ret = (uint32_t)(mant >> (5 + 24)) & UINT32_C(0x007FFFFF);
+            #if 1
+                /* Preserve NaN */
+                bool low_nan = mant & ((UINT32_C(1) << (5 + 24)) - 1);
+                if (low_nan) {
+                    ret |= 1;
+                }
+            #else
+                /* Signal NaN */
+                ret |= (UINT32_C(1) << 22);
+            #endif
+            ret |= UINT32_C(0x7F800000);
+            return ret;
+        }
+        #endif
+        return UINT32_C(0x7F800000);
+    }
+    // result is normal or rounds to infinity
+    const bool sticky = mant & ((UINT64_C(1) << (4 + 24)) - 1);
+    const bool round = mant & (UINT64_C(1) << (4 + 24));
+    const bool guard = mant & (UINT64_C(1) << (5 + 24));
+    mant >>= (5 + 24);
+    uint32_t ret = (uint32_t)mant;
+    // -896 == -0x380
+    uint16_t new_expon = biased + (uint16_t)(127 - 1023);
+    ret |= (uint32_t)new_expon << 23;
+    // we round afterwards incase we round up to the next exponent
+    if (round && (guard || sticky)) {
+        // round up to even
+        ret++;
+    }
+    return ret;
+}
+
+float dtof(long double x_flt) {
+    Float64_Bitwise x;
+    Float32_Bitwise y;
+    #if 1
+        x.flt_part = fabsl(x_flt);
+        y.u32_part = _dtof_c(x.u64_part);
+        if (signbit(x_flt)) {
+            y.u32_part ^= (UINT32_C(1) << 31);
+        }
+    #else
+        x.flt_part = x_flt;
+        y.u32_part = _dtof_c(x.u64_part);
+    #endif
+    return y.flt_part;
+}
+
+static uint64_t rand_u64(void) {
+    union {
+        uint64_t u64;
+        uint16_t u16[4];
+    } split;
+    #if 1
+        split.u16[0] = (uint16_t)rand();
+        split.u16[1] = (uint16_t)rand();
+        split.u16[2] = (uint16_t)rand();
+    #else
+        split.u16[0] = (uint16_t)rand() & 0x0001;
+        split.u16[1] = (uint16_t)rand() & 0xFF00;
+        split.u16[2] = (uint16_t)rand() & 0x8000;
+    #endif
+    #if 0
+        split.u16[3] = (uint16_t)rand() | 0x3E60;
+    #elif 0
+        split.u16[3] = ((unsigned)rand() % (0x4820 - 0x3660)) + 0x3660;
+    #else
+        split.u16[3] = ((unsigned)rand() % (0x3840 - 0x3660)) + 0x3660;
+    #endif
+    
+    return split.u64;
+}
+
+void brute_force_test(void) {
+    fputs("\n", stdout);
+    srand(0x7184CE);
+    for (size_t i = 0; i < 65536; i++) {
+        F64_pun x;
+        F32_pun t;
+        F32_pun g;
+        x.bin = rand_u64();
+        g.flt = (float)x.flt;
+        t.flt = dtof(x.flt);
+        if (g.bin != t.bin) {
+            print_failed(x.flt, g.bin, t.bin);
+            if (!(isnan(g.flt) && isnan(t.flt))) {
+                return;
+            }
+        }
+    }
+    fputs("All Good\n", stdout);
+}
+
+#else
+
+#define print_failed(...)
+
+#define brute_force_test()
+
+#endif
+
 size_t run_test(void) {
     typedef long double input_t;
     typedef F32_pun output_t;
@@ -31,6 +190,7 @@ size_t run_test(void) {
         result.flt = (float)input[i];
         if (result.bin != output[i].bin) {
             if (!(isnan(result.flt) && isnan(output[i].flt))) {
+                print_failed(input[i], result.bin, output[i].bin);
                 return i;
             }
         }
@@ -50,6 +210,7 @@ int main(void) {
         boot_sprintf(buf, "Failed test: %u\n", fail_index);
         fputs(buf, stdout);
     }
+    brute_force_test();
 
     while (!os_GetCSC());