-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Float16Utils.pas
2876 lines (2355 loc) · 102 KB
/
Float16Utils.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{-------------------------------------------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.
-------------------------------------------------------------------------------}
{===============================================================================
Float16Utils
Main purpose of this library is to provide routines for conversion from and
to half precision (16bit) floating point numbers (Single -> Half, Half ->
Single).
It also provides functions for basic arithmetic and comparison, as well as
overloaded operators when compiled using FPC. But note that these functions
only convert arguments given as halfs into single-precision (32bit) floats
and operate on those.
F16C instruction extension (for x86(-64) CPUs) is used when symbol
AllowF16CExtension is defined, PurePascal is not defined, and when (and
only when) it is supported by the CPU and OS.
Implemented Half conforms to IEEE 754-2008, meaning it has one sign bit
(value is negative when sign bit is set, positive otherwise), 5 bits of
biased exponent (exponent bias is 15) and 11 bit mantissa (10 bits
explicitly stored, highest bit is assumed to be zero for denormal numbers
and zero, one otherwise)
NOTE - type Half is declared in unit AuxTypes, not here.
Version 1.1.5 (2024-04-14)
Last change 2024-04-28
©2017-2024 František Milt
Contacts:
František Milt: frantisek.milt@gmail.com
Support:
If you find this code useful, please consider supporting its author(s) by
making a small donation using the following link(s):
https://www.paypal.me/FMilt
Changelog:
For detailed changelog and history please refer to this git repository:
github.com/TheLazyTomcat/Lib.Float16
Dependencies:
* AuxExceptions - github.com/TheLazyTomcat/Lib.AuxExceptions
AuxTypes - github.com/TheLazyTomcat/Lib.AuxTypes
BasicUIM - github.com/TheLazyTomcat/Lib.BasicUIM
* SimpleCPUID - github.com/TheLazyTomcat/Lib.SimpleCPUID
Library AuxExceptions is required only when rebasing local exception classes
(see symbol Float16Utils_UseAuxExceptions for details).
SimpleCPUID is required only when AllowF16CExtension symbol is defined and
PurePascal symbol is not defined.
Libraries AuxExceptions and SimpleCPUID might also be required as indirect
dependencies.
Indirect dependencies:
StrRect - github.com/TheLazyTomcat/Lib.StrRect
UInt64Utils - github.com/TheLazyTomcat/Lib.UInt64Utils
WinFileInfo - github.com/TheLazyTomcat/Lib.WinFileInfo
===============================================================================}
unit Float16Utils;
{
Float16Utils_PurePascal
If you want to compile this unit without ASM, don't want to or cannot define
PurePascal for the entire project and at the same time you don't want to or
cannot make changes to this unit, define this symbol for the entire project
and this unit will be compiled in PurePascal mode.
}
{$IFDEF Float16Utils_PurePascal}
{$DEFINE PurePascal}
{$ENDIF}
{
Float16Utils_UseAuxExceptions
If you want library-specific exceptions to be based on more advanced classes
provided by AuxExceptions library instead of basic Exception class, and don't
want to or cannot change code in this unit, you can define global symbol
Float16Utils_UseAuxExceptions to achieve this.
}
{$IF Defined(Float16Utils_UseAuxExceptions)}
{$DEFINE UseAuxExceptions}
{$IFEND}
//------------------------------------------------------------------------------
{$IF defined(CPUX86_64) or defined(CPUX64)}
{$DEFINE x64}
{$ELSEIF defined(CPU386)}
{$DEFINE x86}
{$ELSE}
{$DEFINE PurePascal}
{$IFEND}
{$IFDEF ENDIAN_BIG}
{$MESSAGE FATAL 'Big-endian architecture not supported'}
{$ENDIF}
{$IFDEF FPC}
{$MODE ObjFPC}{$MODESWITCH CLASSICPROCVARS+}
{$INLINE ON}
{$DEFINE CanInline}
{$IFNDEF PurePascal}
{$ASMMODE Intel}
{$DEFINE ASMSuppressSizeWarnings}
{$ENDIF}
{$DEFINE FPC_DisableWarns}
{$MACRO ON}
{$ELSE}
{$IF CompilerVersion >= 17} // Delphi 2005+
{$DEFINE CanInline}
{$ELSE}
{$UNDEF CanInline}
{$IFEND}
{$ENDIF}
{$H+}
//------------------------------------------------------------------------------
{
AllowF16CExtension
When defined, allows the use of F16C extension in ASM. The extension is used
only when both CPU and OS supports it, otherwise pascal implementation is
called instead.
Has no meaning when PurePascal symbol is defined.
Defined by default.
To disable/undefine this symbol in a project without changing this library,
define project-wide symbol Float16Utils_AllowF16CExtension_Off.
}
{$DEFINE AllowF16CExtension}
{$IFDEF Float16Utils_AllowF16CExtension_Off}
{$UNDEF AllowF16CExtension}
{$ENDIF}
//------------------------------------------------------------------------------
// do not touch following...
{$IF not Defined(PurePascal) and Defined(AllowF16CExtension)}
{$DEFINE F16U_ASM_IMPL}
{$IFEND}
interface
uses
SysUtils,
AuxTypes {contains declaration of type Half}
{$IFDEF UseAuxExceptions}, AuxExceptions{$ENDIF};
{-------------------------------------------------------------------------------
Some predefined Half values and other useful constants
-------------------------------------------------------------------------------}
const
// Predefined Half values, given as raw byte pairs. Judging by the values (eg.
// positive infinity, bit pattern $7C00, is written as $00,$7C), the first byte
// is the low-order one - consistent with this unit refusing to compile for
// big-endian targets.
Infinity: Half = ($00,$7C); // positive infinity (bit pattern $7C00)
NaN: Half = ($00,$7E); // quiet NaN (bit pattern $7E00)
MaxHalf: Half = ($FF,$7B); // 65504 - largest finite value (bit pattern $7BFF)
MinHalf: Half = ($01,$00); // 5.96046e-8 - smallest positive denormal (bit pattern $0001)
PlusOne: Half = ($00,$3C); // +1.0
MinusOne: Half = ($00,$BC); // -1.0
One: Half = ($00,$3C); // +1.0 (same bit pattern as PlusOne)
Zero: Half = ($00,$00); // (+)0
// exponent biases of half-precision (16bit) and single-precision (32bit) floats
FLOAT16_EXPONENTBIAS = 15;
FLOAT32_EXPONENTBIAS = 127;
{===============================================================================
Library-specific exceptions - declaration
===============================================================================}
type
// Common ancestor of all exceptions declared in this unit. It is based on
// EAEGeneralException when symbol UseAuxExceptions is defined, on the basic
// Exception class otherwise.
EF16UException = class({$IFDEF UseAuxExceptions}EAEGeneralException{$ELSE}Exception{$ENDIF});
// general (non-floating-point) error classes; where exactly they are raised
// is decided in the implementation section
EF16UInvalidFlag = class(EF16UException);
EF16UUnknownFunction = class(EF16UException);
EF16UNoImplementation = class(EF16UException);
{-------------------------------------------------------------------------------
Library-specific exceptions - floating-point exceptions
-------------------------------------------------------------------------------}
{
When this exception (and its descendants) is created by calling a constructor
that does not end with "NoClear", and when the MXCSR register is currently
emulated, all exception flag bits will be cleared.
When created using "NoClear" constructor, no exception flag bit is changed.
}
type
// Common ancestor of all exception classes representing floating-point
// exceptions. Descendants must override the abstract method DefaultMessage.
EF16UFPUException = class(EF16UException)
protected
// backing field of the ExceptionFlags property
fExceptionFlags: UInt32;
// message text specific to the exception class; presumably used by the
// "DefMsg" constructors - confirm in the implementation
Function DefaultMessage: String; virtual; abstract;
public
// Constructors with "NoClear" in their name do not change any exception flag
// bit; the others clear all exception flag bits when MXCSR is emulated.
// Parameter Dummy is declared only when not compiled by FPC and has a default
// value, so it does not need to be passed by the caller.
constructor CreateNoClear(const Msg: String{$IFNDEF FPC}; Dummy: Integer = 0{$ENDIF});
constructor Create(const Msg: String);
constructor CreateDefMsgNoClear({$IFNDEF FPC}Dummy: Integer = 0{$ENDIF});
constructor CreateDefMsg;
// ExceptionFlags holds state of exception flags before this exception was created
property ExceptionFlags: UInt32 read fExceptionFlags;
end;
{-------------------------------------------------------------------------------
Library-specific exceptions - individual floating-point exception classes
-------------------------------------------------------------------------------}
type
// One class per floating-point exception kind. Each of them only overrides
// DefaultMessage to provide its own message text, everything else is inherited
// from EF16UFPUException.
EF16UInvalidOp = class(EF16UFPUException) // invalid operation/operand
protected
Function DefaultMessage: String; override;
end;
EF16UDenormal = class(EF16UFPUException) // denormal operand
protected
Function DefaultMessage: String; override;
end;
EF16UDivByZero = class(EF16UFPUException) // division by zero
protected
Function DefaultMessage: String; override;
end;
EF16UOverflow = class(EF16UFPUException) // numeric overflow
protected
Function DefaultMessage: String; override;
end;
EF16UUnderflow = class(EF16UFPUException) // numeric underflow
protected
Function DefaultMessage: String; override;
end;
EF16UPrecision = class(EF16UFPUException) // precision (inexact result)
protected
Function DefaultMessage: String; override;
end;
{-------------------------------------------------------------------------------
================================================================================
Auxiliary routines
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Auxiliary routines - declaration
===============================================================================}
{-------------------------------------------------------------------------------
Auxiliary routines - SSE status and control register (MXCSR) access
-------------------------------------------------------------------------------}
// some constants for MXCSR
const
// exception flag bits (bits 0..5)
MXCSR_EFLAG_InvalidOP = UInt32($00000001);
MXCSR_EFLAG_Denormal = UInt32($00000002);
MXCSR_EFLAG_DivByZero = UInt32($00000004);
MXCSR_EFLAG_Overflow = UInt32($00000008);
MXCSR_EFLAG_Underflow = UInt32($00000010);
MXCSR_EFLAG_Precision = UInt32($00000020);
// exception mask bits (bits 7..12), declared in the same order as the flag bits
MXCSR_EMASK_InvalidOP = UInt32($00000080);
MXCSR_EMASK_Denormal = UInt32($00000100);
MXCSR_EMASK_DivByZero = UInt32($00000200);
MXCSR_EMASK_Overflow = UInt32($00000400);
MXCSR_EMASK_Underflow = UInt32($00000800);
MXCSR_EMASK_Precision = UInt32($00001000);
// mode control bits
MXCSR_DenormalsAreZeros = UInt32($00000040); // bit 6
MXCSR_FlushToZero = UInt32($00008000); // bit 15
// rounding mode bit field and the position of its lowest bit
MXCSR_Rounding = UInt32($00006000); // bits 13..14
MXCSR_SHIFT_Rounding = 13;
{-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
Low-level access
-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --}
{
GetMXCSR
Returns current value of MXCSR register.
}
Function GetMXCSR: UInt32;
{
SetMXCSR
Sets MXCSR register to a passed value.
}
procedure SetMXCSR(NewValue: UInt32);
{
EmulatedMXCSR
Returns false when a real MXCSR register is used, true when operating on an
emulated local implementation.
}
Function EmulatedMXCSR: Boolean;
{
InitMXCSR
Sets MXCSR register to $00001900 - denormal, underflow and precision
exceptions are masked (others are unmasked), rounding is set to nearest,
DAZ and FTZ bits are cleared.
Call this routine only when MXCSR is NOT emulated (ie. a real CPU register is
used) and the program is compiled so that SSE is not used as a primary means
of floating point arithmetic and/or is not automatically initialized (if the
MXCSR equals $00001F80 - a default value - you can safely assume it was
not properly initialized).
WARNING - the initialization must be done in each execution thread.
}
procedure InitMXCSR;{$IFDEF CanInline} inline;{$ENDIF}
{
GetMXCSRMask
Returns a bitmask used when reading and writing the MXCSR register. Zeroes
are marking reserved bits, ones are marking used bits.
This value is only informative, the masking is done automatically in calls to
functions GetMXCSR and SetMXCSR.
}
Function GetMXCSRMask: UInt32;
{
GetMXCSRSupportsDAZ
Returns true when DAZ bit, and therefore denormals-are-zeros mode, is
supported by the used implementation of MXCSR (be it true SSE register or
an emulation). False when not supported.
}
Function GetMXCSRSupportsDAZ: Boolean;
{-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
Abstracted access
-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --}
type
// rounding modes selectable through GetSSERoundingMode/SetSSERoundingMode
TSSERoundingMode = (rmNearest,rmDown,rmUp,rmTruncate);
// individual floating-point exceptions, declared in the same order as the
// MXCSR_EFLAG_* and MXCSR_EMASK_* constants
TSSEException = (excInvalidOp,excDenormal,excDivByZero,excOverflow,
excUnderflow,excPrecision);
TSSEExceptions = set of TSSEException;
// mode flags, named after constants MXCSR_DenormalsAreZeros and MXCSR_FlushToZero
TSSEFlag = (flDenormalsAreZeros,flFlushToZero);
TSSEFlags = set of TSSEFlag;
const
// set containing every member of TSSEException
AllSSEExceptions = [excInvalidOp,excDenormal,excDivByZero,excOverflow,
excUnderflow,excPrecision];
//------------------------------------------------------------------------------
{
GetSSERoundingMode
Returns current value of rounding mode from MXCSR.
}
Function GetSSERoundingMode: TSSERoundingMode;
{
SetSSERoundingMode
Sets rounding mode to a selected NewValue and returns previous value of
rounding mode.
}
Function SetSSERoundingMode(NewValue: TSSERoundingMode): TSSERoundingMode;
//------------------------------------------------------------------------------
{
GetSSEExceptionMask
Returns current value of selected exception mask bit.
}
Function GetSSEExceptionMask(SSEException: TSSEException): Boolean;
{
SetSSEExceptionMask
Sets value of selected exception mask bit in MXCSR to a NewValue and returns
previous value of this bit.
When the bit is set (true), the selected exception will be masked and not
raised on its occurrence.
When clear (false), the exception is unmasked and can be raised.
}
Function SetSSEExceptionMask(SSEException: TSSEException; NewValue: Boolean): Boolean;
//------------------------------------------------------------------------------
{
GetSSEExceptionMasks
Returns status of all exception mask bits in MXCSR. When the bit is set, the
exception is included in the result, when it is clear, the exception is
excluded from the result.
}
Function GetSSEExceptionMasks: TSSEExceptions;
{
SetSSEExceptionMasks
Sets new value of all exception mask bits in MXCSR. If an exception is
included in the NewValue, the mask bit will be set, when it is not included,
the mask bit will be cleared.
Returns previous state of all exception mask bits.
}
Function SetSSEExceptionMasks(NewValue: TSSEExceptions): TSSEExceptions;
//------------------------------------------------------------------------------
{
GetSSEExceptionFlag
Returns current value of selected exception flag bit.
}
Function GetSSEExceptionFlag(SSEException: TSSEException): Boolean;
{
SetSSEExceptionFlag
Sets value of selected exception flag bit in MXCSR to a NewValue and returns
previous value of this bit.
}
Function SetSSEExceptionFlag(SSEException: TSSEException; NewValue: Boolean): Boolean;
//------------------------------------------------------------------------------
{
GetSSEExceptionFlags
Returns status of all exception flag bits in MXCSR. When the bit is set,
the exception is included in the result, when it is clear, the exception is
excluded from the result.
}
Function GetSSEExceptionFlags: TSSEExceptions;
{
SetSSEExceptionFlags
Sets new value of all exception flag bits in MXCSR. If an exception is
included in the NewValue, the flag bit will be set, when it is not included,
the flag bit will be cleared.
Returns previous state of all exception flag bits.
}
Function SetSSEExceptionFlags(NewValue: TSSEExceptions): TSSEExceptions;
//------------------------------------------------------------------------------
{
GetSSEFlag
Returns current value of selected flag bit.
}
Function GetSSEFlag(Flag: TSSEFlag): Boolean;
{
SetSSEFlag
Sets value of selected flag bit in MXCSR to a NewValue and returns previous
value of this bit.
}
Function SetSSEFlag(Flag: TSSEFlag; NewValue: Boolean): Boolean;
//------------------------------------------------------------------------------
{
GetSSEFlags
Returns status of all flag bits in MXCSR. When the bit is set, the flag is
included in the result, when it is clear, the flag is excluded from the
result.
}
Function GetSSEFlags: TSSEFlags;
{
SetSSEFlags
Sets new value of all flag bits in MXCSR. If a flag is included in the
NewValue, the bit will be set, when it is not included, the bit will be
cleared.
Returns previous state of all flag bits (that is, the flags that were set
before the call), the same way other Set* routines in this section return
the previous state of whatever they are changing.
}
Function SetSSEFlags(NewValue: TSSEFlags): TSSEFlags;
//------------------------------------------------------------------------------
{
ClearSSEExceptions
Clears (sets to 0) lower 6 bits of MXCSR - that is, all exception flag bits.
}
procedure ClearSSEExceptions;{$IF Defined(CanInline) and not Defined(FPC)} inline;{$IFEND}
{
RaiseSSEExceptions(MXCSR)
Raises first encountered exception according to flags set in the passed MXCSR.
Parameter Mask controls whether to honor exception masking (true) or not
(false) when raising an exception (when honored, the masked exceptions are
NOT raised, when not honored, all exceptions can be raised, even those
masked).
Mask bits are taken from the parameter MXCSR, not from the actual register.
The exception flag bits are traversed one by one and, when a set bit is
encountered, it is cleared and a corresponding exception is raised (if
allowed by masking - see parameter Mask).
Only one exception is raised in each call, even when multiple bits are set.
The order in which the bits are traversed, and therefore the order of
exception raising is:
InvalidOP
Denormal
DivByZero
Underflow
Overflow
Precision
}
procedure RaiseSSEExceptions(var MXCSR: UInt32; Mask: Boolean = True); overload;
{
RaiseSSEExceptions
Calls the first overload with an input being current value of MXCSR (be it
real register or emulation).
Note that MXCSR register is NOT affected by this function.
}
procedure RaiseSSEExceptions(Mask: Boolean = True); overload;{$IFDEF CanInline} inline;{$ENDIF}
{-------------------------------------------------------------------------------
Auxiliary routines - conversion functions
-------------------------------------------------------------------------------}
{
MapFloat16ToWord
MapHalfToWord
Directly maps type half (float16) to a 16bit unsigned integer - no conversion
is done.
}
Function MapFloat16ToWord(Value: Float16): UInt16;
Function MapHalfToWord(Value: Half): UInt16;{$IFDEF CanInline} inline;{$ENDIF}
{
MapWordToFloat16
MapWordToHalf
Directly maps 16bit unsigned integer to type half (float16) - no conversion
is done.
}
Function MapWordToFloat16(Value: UInt16): Float16;
Function MapWordToHalf(Value: UInt16): Half;{$IFDEF CanInline} inline;{$ENDIF}
//------------------------------------------------------------------------------
procedure Float16ToFloat32(Float16Ptr,Float32Ptr: Pointer); overload;
procedure HalfToSingle(HalfPtr,SinglePtr: Pointer); overload;
procedure Float32ToFloat16(Float32Ptr,Float16Ptr: Pointer); overload;
procedure SingleToHalf(SinglePtr,HalfPtr: Pointer); overload;
Function Float16ToFloat32(Value: Float16): Float32; overload;
Function HalfToSingle(Value: Half): Single; overload;
Function Float32ToFloat16(Value: Float32): Float16; overload;
Function SingleToHalf(Value: Single): Half; overload;
//------------------------------------------------------------------------------
{
Following functions are expecting pointers to packed vector of four singles
(SinglePtr, Float32Ptr) and packed vector of four halfs (HalfPtr, Float16Ptr).
}
procedure Float16ToFloat32Vec4(Float16Ptr,Float32Ptr: Pointer);
procedure HalfToSingleVec4(HalfPtr,SinglePtr: Pointer);
procedure Float32ToFloat16Vec4(Float32Ptr,Float16Ptr: Pointer);
procedure SingleToHalfVec4(SinglePtr,HalfPtr: Pointer);
{-------------------------------------------------------------------------------
================================================================================
Number information
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Number information - declaration
===============================================================================}
{-------------------------------------------------------------------------------
Number information - number class
-------------------------------------------------------------------------------}
Function IsZero(const Value: Half): Boolean;
Function IsDenormal(const Value: Half): Boolean;
Function IsNaN(const Value: Half): Boolean;
Function IsInfinite(const Value: Half): Boolean;
Function IsNormal(const Value: Half): Boolean; // returns false on zero
{-------------------------------------------------------------------------------
Number information - sign-related
-------------------------------------------------------------------------------}
type
TValueSign = -1..1;
Function Sign(const Value: Half): TValueSign;
Function Abs(const Value: Half): Half;
Function Neg(const Value: Half): Half;
{-------------------------------------------------------------------------------
================================================================================
Comparison functions
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Comparison functions - declaration
===============================================================================}
{-------------------------------------------------------------------------------
Comparison functions - basic comparison
-------------------------------------------------------------------------------}
Function IsEqual(const A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
Function IsLess(const A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
Function IsGreater(const A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
Function IsLessOrEqual(const A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
Function IsGreaterOrEqual(const A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
{-------------------------------------------------------------------------------
Comparison functions - ordered comparison
-------------------------------------------------------------------------------}
type
// result type of CompareValue functions
TValueRelationship = -1..1; // declared here to prevent problems (because Delphi vs. FPC)
Function CompareValue(const A,B: Half; Epsilon: Half): TValueRelationship;{$IF Defined(CanInline) and not Defined(FPC)} inline;{$IFEND} overload;
Function CompareValue(const A,B: Half): TValueRelationship;{$IF Defined(CanInline) and not Defined(FPC)} inline;{$IFEND} overload;
Function SameValue(const A,B: Half; Epsilon: Half): Boolean;{$IF Defined(CanInline) and not Defined(FPC)} inline;{$IFEND} overload;
Function SameValue(const A,B: Half): Boolean;{$IF Defined(CanInline) and not Defined(FPC)} inline;{$IFEND} overload;
{-------------------------------------------------------------------------------
================================================================================
Arithmetic functions
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Arithmetic functions - declaration
===============================================================================}
{-------------------------------------------------------------------------------
Arithmetic functions - basic arithmetic
-------------------------------------------------------------------------------}
Function Add(const A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
Function Subtract(const A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
Function Multiply(const A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
Function Divide(const A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
{-------------------------------------------------------------------------------
================================================================================
Floats encode/decode
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Floats encode/decode - declaration
===============================================================================}
procedure MapToFloat16Buffer(out Buffer; Value: UInt16);
Function MapToFloat16(Value: UInt16): Float16;{$IFDEF CanInline} inline;{$ENDIF}
Function MapToHalf(Value: UInt16): Half;{$IFDEF CanInline} inline;{$ENDIF}
Function MapFromFloat16Buffer(const Buffer): UInt16;
Function MapFromFloat16(const Value: Float16): UInt16;{$IFDEF CanInline} inline;{$ENDIF}
Function MapFromHalf(const Value: Half): UInt16;{$IFDEF CanInline} inline;{$ENDIF}
//------------------------------------------------------------------------------
{
EncodeFloat16Buffer
EncodeFloat16
EncodeHalf
When BiasedExp is true, it indicates that the passed exponent is already
biased and will be stored as is. When false, the passed exponent will be
biased before storing.
NOTE - the valid range for exponent is 0..31 when biased, -15..+16
when unbiased. The exponent is clamped (limited to a prescribed
range) before biasing and storing.
Integer bit, when passed in the mantissa, is ignored - it is implied for
half-precision float.
NOTE - only lowest 10 bits of the mantissa are used, other bits get
masked-out before storage.
}
procedure EncodeFloat16Buffer(out Buffer; Mantissa: UInt16; Exponent: Int8; Sign: Boolean; BiasedExp: Boolean = False);
Function EncodeFloat16(Mantissa: UInt16; Exponent: Int8; Sign: Boolean; BiasedExp: Boolean = False): Float16;{$IFDEF CanInline} inline;{$ENDIF}
Function EncodeHalf(Mantissa: UInt16; Exponent: Int8; Sign: Boolean; BiasedExp: Boolean = False): Half;{$IFDEF CanInline} inline;{$ENDIF}
{
DecodeFloat16Buffer
DecodeFloat16
DecodeHalf
When BiasedExp is set to true, the returned exponent is exponent as it is
stored in the value, that is, biased. When false, the returned exponent is
unbiased (its true value).
NOTE - returned exponent will be within range of 0..31 when biased,
-15..+16 when unbiased.
When IntBit is set to true, the returned mantissa contains the integer bit
(bit 10) inferred from the number class (0 for denormals and zero,
1 otherwise). When false, the integer bit is masked-out and is zero,
irrespective of actual value.
NOTE - only lowest 10 (11 with integer bit) bits of the mantissa are valid,
other bits will always be zero.
}
procedure DecodeFloat16Buffer(const Buffer; out Mantissa: UInt16; out Exponent: Int8; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);
procedure DecodeFloat16(const Value: Float16; out Mantissa: UInt16; out Exponent: Int8; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);{$IFDEF CanInline} inline;{$ENDIF}
procedure DecodeHalf(const Value: Half; out Mantissa: UInt16; out Exponent: Int8; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);{$IFDEF CanInline} inline;{$ENDIF}
//------------------------------------------------------------------------------
procedure MapToFloat32Buffer(out Buffer; Value: UInt32);
Function MapToFloat32(Value: UInt32): Float32;{$IFDEF CanInline} inline;{$ENDIF}
Function MapToSingle(Value: UInt32): Single;{$IFDEF CanInline} inline;{$ENDIF}
Function MapFromFloat32Buffer(const Buffer): UInt32;
Function MapFromFloat32(const Value: Float32): UInt32;{$IFDEF CanInline} inline;{$ENDIF}
Function MapFromSingle(const Value: Single): UInt32;{$IFDEF CanInline} inline;{$ENDIF}
//------------------------------------------------------------------------------
{
EncodeFloat32Buffer
EncodeFloat32
EncodeSingle
When BiasedExp is true, it indicates that the passed exponent is already
biased and will be stored as is. When false, the passed exponent will be
biased before storing.
NOTE - the valid range for exponent is 0..255 when biased, -127..+128
when unbiased. The exponent is clamped (limited to a prescribed
range) before biasing and storing.
Integer bit, when passed in the mantissa, is ignored - it is implied for
single-precision float.
NOTE - only lowest 23 bits of the mantissa are used, other bits get
masked-out before storage.
}
procedure EncodeFloat32Buffer(out Buffer; Mantissa: UInt32; Exponent: Int16; Sign: Boolean; BiasedExp: Boolean = False);
Function EncodeFloat32(Mantissa: UInt32; Exponent: Int16; Sign: Boolean; BiasedExp: Boolean = False): Float32;{$IFDEF CanInline} inline;{$ENDIF}
Function EncodeSingle(Mantissa: UInt32; Exponent: Int16; Sign: Boolean; BiasedExp: Boolean = False): Single;{$IFDEF CanInline} inline;{$ENDIF}
{
DecodeFloat32Buffer
DecodeFloat32
DecodeSingle
When BiasedExp is set to true, the returned exponent is exponent as it is
stored in the value, that is, biased. When false, the returned exponent is
unbiased (its true value).
NOTE - returned exponent will be within range of 0..255 when biased,
-127..+128 when unbiased.
When IntBit is set to true, the returned mantissa contains the integer bit
(bit 23) inferred from the number class (0 for denormals and zero,
1 otherwise). When false, the integer bit is masked-out and is zero,
irrespective of actual value.
NOTE - only lowest 23 (24 with integer bit) bits of the mantissa are valid,
other bits will always be zero.
}
procedure DecodeFloat32Buffer(const Buffer; out Mantissa: UInt32; out Exponent: Int16; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);
procedure DecodeFloat32(const Value: Float32; out Mantissa: UInt32; out Exponent: Int16; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);{$IFDEF CanInline} inline;{$ENDIF}
procedure DecodeSingle(const Value: Single; out Mantissa: UInt32; out Exponent: Int16; out Sign: Boolean; BiasedExp: Boolean = False; IntBit: Boolean = True);{$IFDEF CanInline} inline;{$ENDIF}
{$IFDEF FPC}
{-------------------------------------------------------------------------------
================================================================================
Operators overloading
================================================================================
-------------------------------------------------------------------------------}
{===============================================================================
Operators overloading - declaration
===============================================================================}
{
  Operator overloading is currently implemented only for FPC (the whole
  section is compiled only when symbol FPC is defined).
}
// implicit assignment (conversion) operators between Half and Single
operator := (Value: Half): Single;{$IFDEF CanInline} inline;{$ENDIF}
operator := (Value: Single): Half;{$IFDEF CanInline} inline;{$ENDIF}
// explicit assignment (typecast) operators between Half and Single
operator explicit (Value: Half): Single;{$IFDEF CanInline} inline;{$ENDIF}
operator explicit (Value: Single): Half;{$IFDEF CanInline} inline;{$ENDIF}
// comparison operators (two Half operands, Boolean result)
operator = (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
operator > (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
operator < (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
operator >= (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
operator <= (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
operator <> (A,B: Half): Boolean;{$IFDEF CanInline} inline;{$ENDIF}
// unary operators (one Half operand, Half result)
operator + (A: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
operator - (A: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
// binary arithmetic operators (two Half operands, Half result)
operator + (A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
operator - (A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
operator * (A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
operator / (A,B: Half): Half;{$IFDEF CanInline} inline;{$ENDIF}
{$ENDIF}
{-------------------------------------------------------------------------------
================================================================================
Unit implementation management
================================================================================
-------------------------------------------------------------------------------}
{
WARNING - be very careful when changing the selected implementation, as there
is absolutely no thread-safety protection.
For full description of this section, please refer to the same section in
BitOps library (github.com/TheLazyTomcat/Lib.BitOps), file BitOps.pas.
}
type
// identifiers of the functions whose implementation can be queried and routed
// using the UIM_Float16Utils_* functions declared below
TUIM_Float16Utils_Function = (fnGetMXCSR,fnSetMXCSR,
fnHalfToSingle,fnSingleToHalf,
fnHalfToSingle4x,fnSingleToHalf4x);
// implementation a function can be routed to - imNone means no implementation
// (routing is set to nil), imPascal the pascal version and imAssembly the
// assembly (asm) version
TUIM_Float16Utils_Implementation = (imNone,imPascal,imAssembly);
// set of implementations, used as a result type of functions
// UIM_Float16Utils_AvailableFuncImpl and UIM_Float16Utils_SupportedFuncImpl
TUIM_Float16Utils_Implementations = set of TUIM_Float16Utils_Implementation;
//------------------------------------------------------------------------------
{
  Returns which implementations are available for the selected function
  (parameter Func).

    NOTE - for implementations that can also be safely selected, see function
           UIM_Float16Utils_SupportedFuncImpl.
}
Function UIM_Float16Utils_AvailableFuncImpl(Func: TUIM_Float16Utils_Function): TUIM_Float16Utils_Implementations;
{
  Returns which implementations are supported and can be safely selected for
  a given function (parameter Func).

    NOTE - what can happen when an unsupported implementation is selected is
           described at function UIM_Float16Utils_SetFuncImpl.
}
Function UIM_Float16Utils_SupportedFuncImpl(Func: TUIM_Float16Utils_Function): TUIM_Float16Utils_Implementations;
{
  Returns a value indicating which implementation of the selected function
  (parameter Func) is executed when the function is called, that is, where
  the function is currently routed.
}
Function UIM_Float16Utils_GetFuncImpl(Func: TUIM_Float16Utils_Function): TUIM_Float16Utils_Implementation;
{
  Routes selected function (Func) to a selected implementation (NewImpl).

  Returned value is the previous routing.

    NOTE - when routing GetMXCSR or SetMXCSR (fnGetMXCSR, fnSetMXCSR), both
           functions are set to the same implementation - sanity protection,
           so they do not operate on different domains.

    NOTE - when asm implementation cannot be used, and you still select it,
           the function will be routed to pascal version.

    WARNING - when selecting imNone as an implementation for some function,
              the routing is set to nil, and because the routing mechanism,
              for the sake of speed, does not check validity, it will result
              in an exception when calling this function.

    WARNING - when selecting unsupported implementation, calling the function
              will almost certainly result in a system exception (invalid
              instruction).
}
Function UIM_Float16Utils_SetFuncImpl(Func: TUIM_Float16Utils_Function; NewImpl: TUIM_Float16Utils_Implementation): TUIM_Float16Utils_Implementation;
implementation
uses
{$IF Defined(AllowF16CExtension) and not Defined(PurePascal)}
SimpleCPUID,
{$IFEND}
BasicUIM,
Math;
{$IFDEF FPC_DisableWarns}
{$DEFINE FPCDWM}
{$DEFINE W4055:={$WARN 4055 OFF}} // Conversion between ordinals and pointers is not portable
{$DEFINE W5024:={$WARN 5024 OFF}} // Parameter "$1" not used
{$ENDIF}
{-------------------------------------------------------------------------------
Internal constants
-------------------------------------------------------------------------------}
const
// bit masks for fields of a 16-bit (half precision) float
F16_MASK_SIGN = UInt16($8000); // sign bit (bit 15)
F16_MASK_EXP = UInt16($7C00); // exponent (bits 10..14)
F16_MASK_FRAC = UInt16($03FF); // fraction/mantissa (bits 0..9)
F16_MASK_NSGN = UInt16($7FFF); // non-sign bits (bits 0..14)
F16_MASK_FHB = UInt16($0200); // highest bit of the mantissa (bit 9)
F16_MASK_INTB = UInt16($0400); // otherwise implicit integer bit of the mantissa (bit 10)
// bit masks for fields of a 32-bit (single precision) float
F32_MASK_SIGN = UInt32($80000000); // sign bit (bit 31)
F32_MASK_EXP = UInt32($7F800000); // exponent (bits 23..30)
F32_MASK_FRAC = UInt32($007FFFFF); // fraction/mantissa (bits 0..22)
{$IFNDEF FPC} // not used anywhere (NOTE(review): presumably meant as "not used anywhere in FPC builds", hence declared only outside of FPC - confirm)
F32_MASK_NSGN = UInt32($7FFFFFFF); // non-sign bits (bits 0..30)
{$ENDIF}
F32_MASK_FHB = UInt32($00400000); // highest bit of the mantissa (bit 22)
F32_MASK_INTB = UInt32($00800000); // otherwise implicit integer bit of the mantissa (bit 23)
F32_MASK_REMB = UInt32($00001FFF); // 13 bits removed from single mantissa when converting to half mantissa
{===============================================================================
Library-specific exceptions - implementation
===============================================================================}
{-------------------------------------------------------------------------------
Library-specific exceptions - floating-point exceptions
-------------------------------------------------------------------------------}
constructor EF16UFPUException.CreateNoClear(const Msg: String{$IFNDEF FPC}; Dummy: Integer{$ENDIF});
{
  Creates the exception with the given message and stores exception flags
  obtained from GetMXCSR in fExceptionFlags. Nothing is cleared here.

  Parameter Dummy (declared only when not compiling in FPC) is not used by
  the body.
}
const
  // selects the low six bits of MXCSR (the exception flags)
  EFlagsBitMask = $3F;
begin
  inherited Create(Msg);
  // only the masked bits are of interest, rest of the register is discarded
  fExceptionFlags := GetMXCSR and EFlagsBitMask;
end;
//------------------------------------------------------------------------------
constructor EF16UFPUException.Create(const Msg: String);
{
  Creates the exception with the given message and captured exception flags
  (see CreateNoClear), then calls ClearSSEExceptions - but only when
  EmulatedMXCSR is true.
}
begin
  // message and exception flags are stored by the non-clearing constructor
  CreateNoClear(Msg);
  // these exceptions should not change e-flags in true MXCSR when it is not
  // used, therefore the clearing is done only for emulated MXCSR
  If EmulatedMXCSR then
    begin
      ClearSSEExceptions;
    end;
end;
//------------------------------------------------------------------------------
constructor EF16UFPUException.CreateDefMsgNoClear({$IFNDEF FPC}Dummy: Integer{$ENDIF});
{
  Same as CreateNoClear, only the message is not given by the caller but is
  obtained from method DefaultMessage.

  Parameter Dummy (declared only when not compiling in FPC) is not used by
  the body.
}
var
  DefMsgText: String;
begin
  DefMsgText := DefaultMessage;
  CreateNoClear(DefMsgText);
end;
//------------------------------------------------------------------------------
constructor EF16UFPUException.CreateDefMsg;
{
  Same as Create, only the message is not given by the caller but is obtained
  from method DefaultMessage.
}
var
  DefMsgText: String;
begin
  DefMsgText := DefaultMessage;
  Create(DefMsgText);
end;
{-------------------------------------------------------------------------------
Library-specific exceptions - individual floating-point exception classes
-------------------------------------------------------------------------------}
Function EF16UInvalidOp.DefaultMessage: String;
{
  Returns the default message text of the invalid operation exception.
}
const
  DefMsgText = 'Invalid floating point operand';
begin
  Result := DefMsgText;
end;
//==============================================================================
Function EF16UDenormal.DefaultMessage: String;
{
  Returns the default message text of the denormal operand exception.
}
const
  DefMsgText = 'Denormal floating point operand';
begin
  Result := DefMsgText;
end;
//==============================================================================
Function EF16UDivByZero.DefaultMessage: String;
{
  Returns the default message text of the division by zero exception.
}
const
  DefMsgText = 'Floating point division by zero';
begin
  Result := DefMsgText;
end;
//==============================================================================
Function EF16UOverflow.DefaultMessage: String;
{
  Returns the default message text of the overflow exception.
}
const
  DefMsgText = 'Floating point arithmetic overflow';
begin
  Result := DefMsgText;
end;
//==============================================================================
Function EF16UUnderflow.DefaultMessage: String;
{
  Returns the default message text of the underflow exception.
}
const
  DefMsgText = 'Floating point arithmetic underflow';
begin
  Result := DefMsgText;
end;