Skip to content

Commit 0beaf03

Browse files
committed
Implemented dtof in assembly
1 parent a9687a1 commit 0beaf03

File tree

3 files changed

+397
-20
lines changed

3 files changed

+397
-20
lines changed

src/crt/dtof.src

Lines changed: 224 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,224 @@
1-
assume adl=1
2-
3-
section .text
4-
5-
public __dtof
6-
7-
__dtof:
8-
; f64_ret_f32
; Previous implementation: a thin wrapper that saves registers, calls
; ___f64_to_f32 and restores everything except HL and E.
; NOTE(review): the calling convention of ___f64_to_f32 is not visible here;
; presumably it reads the pushed BC/DE/HL as its argument and returns in E:UHL.
9-
push af, iy, bc, de, hl ; multi-operand push; presumably pushed left to right
10-
call ___f64_to_f32
11-
pop af ; drop the pushed HL slot (AF is reloaded by the final pop)
12-
ld a, e ; keep E while DE is restored
13-
pop de
14-
ld e, a ; put the kept E back
15-
pop bc, iy, af
16-
ret
17-
18-
extern ___f64_to_f32
1+
assume adl=1
2+
3+
section .text
4+
5+
public __dtof
6+
7+
private __dtof_helper
8+
__dtof_helper:
9+
; Out-of-line tail of __dtof for inputs whose exponent is too large for F32:
; finite overflow, infinity and NaN. Entered only via `jr c` from __dtof.
; Keeping this block outside the body of __dtof (it is placed just before it)
10+
; ensures that __dtof.ret_copysign can always be reached by jr in all paths.
11+
.overflow:
12+
; carry is set here
13+
pop hl ; reload the HL pushed by __dtof (HL was used as scratch)
14+
; A = $10
15+
add a, c ; attempts to overflow the low 4 bits of the exponent
16+
rl b ; (0x7F << 1) | 1 if the input is inf/NaN
17+
inc b ; B will only be zero if the input was inf/NaN
18+
jr nz, .not_inf_nan
19+
20+
; carry is cleared
; Check whether any of the low bits that are about to be dropped are set.
21+
adc hl, hl ; Z only if no bit below bit 23 of HL is set; bit 23 moves to carry
22+
jr nz, .has_payload
23+
ld a, e
24+
rla ; bring the carry from adc hl, hl into bit 0
25+
and a, $3F ; bits 4..0 of E plus that carry
26+
jr z, .no_payload
27+
.has_payload:
28+
set 5, e ; ensure that NaN stays NaN
29+
.no_payload:
; Build the F32 mantissa word from C:UDE shifted right by 5.
; NOTE(review): assumes __lshru shifts A:UBC right by L - confirm against its definition.
30+
ld a, c
31+
push de
32+
pop bc ; BC = DE
33+
ld l, 5
34+
call __lshru
35+
push bc
36+
pop hl ; HL = BC
37+
.finish_inf_nan:
38+
ld a, $7F ; becomes E (before the sign bit is applied) at .ret_copysign
39+
jr __dtof.ret_copysign
40+
.not_inf_nan:
41+
; return infinity
42+
ld hl, $800000
43+
jr .finish_inf_nan
44+
45+
; Convert BC:UDE:UHL F64 to E:UHL F32
46+
; Rounding: round to nearest with ties to even
47+
; Behaviour:
48+
; Underflow: Returns signed zero. No signals raised.
49+
; Subnormal: No signals raised.
50+
; Rounded to Infinity: No signals raised.
51+
; Overflow: Returns signed infinity. No signals raised.
52+
; Signaling NaN: Quiet bit preserved. No signals raised.
53+
; Quiet NaN: Quiet bit preserved. No signals raised.
54+
; NaN Payloads: Copies the most significant payload bits. The LSB of mantissa is set if payload bits were discarded/truncated out.
; Registers: AF, BC and DE are pushed on entry and popped again at .ret_copysign;
; after that only E is overwritten (top byte of the result, sign in bit 7).
; Calls __llshru, __lland and __lshru; their calling conventions are not visible
; in this file, so comments about them below are assumptions to be confirmed.
55+
__dtof:
56+
bit 7, b ; Z = sign bit clear; consumed by `ret z` at .ret_copysign
57+
push af ; preserve A and signbit
58+
push bc
59+
push de
60+
push hl
61+
; clear UBC
62+
inc bc
63+
dec.s bc
64+
res 7, b ; drop the sign bit from the exponent word
65+
ld hl, -$3810
66+
add hl, bc
67+
jr nc, .maybe_subnormal ; BC < $3810: below the normal F32 range
68+
ld hl, -$47F0 ; $FFB810
69+
ld a, l ; ld a, $10
70+
add hl, bc
71+
jr c, __dtof_helper.overflow ; BC >= $47F0: overflow, infinity or NaN
72+
; result is normal or rounds to infinity
73+
; calculate new exponent
74+
; we only need the low 8 bits of the exponent
75+
add hl, hl
76+
add hl, hl
77+
add hl, hl
78+
add hl, hl
79+
; offset = -$380 - -$47F = $FF = -1 ; therefore decrement
80+
dec h ; store new exponent
81+
ld l, 29 ; f64_mant_bits - f32_mant_bits = 52 - 23 = 29
82+
ex (sp), hl ; (SP) = exponent/shift, HL = lo24
83+
84+
; clear exponent
85+
dec a ; ld a, $0F
86+
and a, c
87+
ld c, a
88+
xor a, a
89+
ld b, a
; From here A is the "round up" indicator: zero means truncate.
90+
; test round bit
91+
bit 4, e
92+
jr z, .round_down
93+
; test guard bit (bit 5 of E; with the 29-bit shift this is the bit that
; becomes the LSB of the result, so it decides a tie)
94+
ld a, e
95+
and a, $20
96+
jr nz, .round_up
97+
; test sticky bits
98+
inc a ; make A non-zero
99+
adc hl, hl ; HL is only tested here; the doubled value is not used afterwards
100+
jr nz, .round_up
101+
ld a, e
102+
rla ; bring the carry from adc hl, hl into bit 0
103+
and a, $1F
104+
.round_up:
105+
.round_down:
; NOTE(review): the code below relies on A surviving the call - confirm for __llshru.
106+
call __llshru
107+
or a, a
108+
jr z, .no_round
109+
inc hl ; does not overflow
110+
.no_round:
111+
pop af ; a = exponent, flags = 29 = ---5H3V-C
112+
or a, a ; clear carry so rra shifts in a zero
113+
rra ; A = exponent >> 1, carry = low bit of the exponent
114+
jr nc, .even_exponent
115+
ld bc, $800000 ; the low exponent bit sits in bit 23 of UHL
116+
add hl, bc ; the result might be rounded to infinity here
117+
adc a, c ; adc a, 0 ; won't overflow
118+
.even_exponent:
119+
.subnormal_no_round:
120+
.ret_copysign:
; Common exit. Expects A = top byte of the result without the sign, HL = low
; 24 bits, and the entry pushes of DE, BC, AF still on the stack.
121+
pop de
122+
ld e, a
123+
pop bc
124+
pop af ; A and the Z flag saved at entry
125+
ret z ; sign bit was clear
126+
set 7, e
127+
ret
128+
129+
.ret_zero:
130+
; carry is cleared
131+
pop hl
132+
xor a, a
133+
sbc hl, hl ; HL = 0
134+
jr .ret_copysign
135+
136+
.maybe_subnormal:
137+
ld hl, -$3690
138+
add hl, bc
139+
jr nc, .ret_zero ; BC < $3690: result is a signed zero
140+
; calculate shift
141+
; A = (uint8_t)((BC - $3690) >> 4)
142+
; A = (uint8_t)((HL << 4) >> 8)
143+
add hl, hl
144+
add hl, hl
145+
add hl, hl
146+
add hl, hl
147+
ld a, h
148+
; Shift = -A + 4 + 24
149+
cpl
150+
add a, (4 + 24) + 1 ; (4 + 24) + CPL trick
151+
; maximum shift = 24 + 4 + 25 = 24 + 29 = 53
152+
; minimum shift = 24 + 4 + 1 = 24 + 5 = 29
153+
ld b, a
154+
ld e, a ; store shift amount
155+
xor a, a
156+
; calculate sticky bits
157+
ld hl, 1
158+
.shift_loop:
159+
add hl, hl
160+
rla
161+
djnz .shift_loop
162+
; carry won't be set
163+
; set C:UDE to A:UHL
164+
; shift by an additional 24 bits
165+
dec hl
166+
or a, a
167+
jr z, .the_set_bit_is_in_hl
168+
dec a
169+
.the_set_bit_is_in_hl:
170+
ld c, a
171+
ld a, e ; restore shift amount
172+
ex de, hl
173+
scf
174+
sbc hl, hl ; ld hl, -1
175+
; BC:UDE:UHL = 1 << shift
; NOTE(review): after the decrements above this looks like a mask of ones,
; (1 << (shift + 24)) - 1, rather than a single set bit - confirm.
176+
; (SP) = X
177+
call __lland
178+
; test if BC:UDE:UHL is zero
179+
; UBC must be zero for this to work
180+
add hl, de ; carry may be set
181+
adc hl, bc ; won't overflow
; The Z flag from adc is kept until `jr z, .no_sticky_bits` below; none of the
; pops, pushes or the `ld d, 0` in between modify flags.
182+
pop hl
183+
; DE and BC are swapped here
184+
pop bc ; BC = DE as pushed at entry
185+
pop de ; DE = BC as pushed at entry
186+
push de
187+
push bc ; stack is back to DE, BC, AF order for .ret_copysign
188+
189+
; clear exponent and include the implicit mantissa bit
190+
ld d, 0 ; D = sticky indicator (0 = none, 1 = some)
191+
jr z, .no_sticky_bits
192+
inc d
193+
.no_sticky_bits:
194+
195+
ld l, a ; L = shift
196+
ld a, e
197+
and a, $0F
198+
or a, $10
199+
200+
call __lshru
201+
xor a, a ; subnormal exponent
202+
; HL = BC >> 1
203+
scf
204+
sbc hl, hl ; ld hl, -1
205+
add hl, sp ; HL = address of the upper byte of the BC pushed next
206+
push bc
207+
srl (hl) ; shift the upper byte in place on the stack
208+
pop hl
209+
rr h
210+
rr l ; round bit shifted out
211+
212+
jr nc, .subnormal_no_round
213+
dec d ; Z if sticky bits were found
214+
jr z, .subnormal_round_up
215+
bit 0, l ; exact tie: round up only if the result is odd
216+
jr z, .subnormal_no_round
217+
.subnormal_round_up:
218+
inc hl ; won't overflow, but may become FLT_MIN
219+
; .subnormal_no_round:
220+
jr .ret_copysign
221+
222+
extern __lland
223+
extern __llshru
224+
extern __lshru

test/floating_point/float64_to_float32/src/f64_to_f32_LUT.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ typedef uint64_t input_type;
99

1010
typedef uint32_t output_type;
1111

12-
static const input_type f64_to_f32_LUT_input[256] = {
12+
static const input_type f64_to_f32_LUT_input[260] = {
1313
/* 0 */ UINT64_C(0x0000000000000000),
1414
/* 1 */ UINT64_C(0x0000000000000001),
1515
/* 2 */ UINT64_C(0x0010000000000000),
@@ -266,9 +266,14 @@ static const input_type f64_to_f32_LUT_input[256] = {
266266
/* 253 */ UINT64_C(0xD22D38D57ABF3991),
267267
/* 254 */ UINT64_C(0xA86498F2933913FB),
268268
/* 255 */ UINT64_C(0x4841C1F00831E908),
269+
/* bonus edge cases */
270+
/* 256 */ UINT64_C(0x369F82B925D1BFBA),
271+
/* 257 */ UINT64_C(0xB76634D97D4F585C),
272+
/* 258 */ UINT64_C(0x36DD000000000000),
273+
/* 259 */ UINT64_C(0xB80E0000A0000000),
269274
};
270275

271-
const output_type f64_to_f32_LUT_output[256] = {
276+
const output_type f64_to_f32_LUT_output[260] = {
272277
/* 0 */ UINT32_C(0x00000000),
273278
/* 1 */ UINT32_C(0x00000000),
274279
/* 2 */ UINT32_C(0x00000000),
@@ -525,6 +530,11 @@ const output_type f64_to_f32_LUT_output[256] = {
525530
/* 253 */ UINT32_C(0xFF800000),
526531
/* 254 */ UINT32_C(0x80000000),
527532
/* 255 */ UINT32_C(0x7F800000),
533+
/* bonus edge cases */
534+
/* 256 */ UINT32_C(0x00000001),
535+
/* 257 */ UINT32_C(0x80001635),
536+
/* 258 */ UINT32_C(0x0000000E),
537+
/* 259 */ UINT32_C(0x80780002),
528538
};
529539

530540
#endif /* F64_TO_F32_LUT_H */

0 commit comments

Comments
 (0)