diff options
Diffstat (limited to 'ext/ipp/sources/include/ia_32e.inc')
-rw-r--r-- | ext/ipp/sources/include/ia_32e.inc | 4651 |
1 files changed, 4651 insertions, 0 deletions
diff --git a/ext/ipp/sources/include/ia_32e.inc b/ext/ipp/sources/include/ia_32e.inc new file mode 100644 index 0000000..b6e2e14 --- /dev/null +++ b/ext/ipp/sources/include/ia_32e.inc @@ -0,0 +1,4651 @@ +;=============================================================================== +; Copyright 2015-2018 Intel Corporation +; All Rights Reserved. +; +; If this software was obtained under the Intel Simplified Software License, +; the following terms apply: +; +; The source code, information and material ("Material") contained herein is +; owned by Intel Corporation or its suppliers or licensors, and title to such +; Material remains with Intel Corporation or its suppliers or licensors. The +; Material contains proprietary information of Intel or its suppliers and +; licensors. The Material is protected by worldwide copyright laws and treaty +; provisions. No part of the Material may be used, copied, reproduced, +; modified, published, uploaded, posted, transmitted, distributed or disclosed +; in any way without Intel's prior express written permission. No license under +; any patent, copyright or other intellectual property rights in the Material +; is granted to or conferred upon you, either expressly, by implication, +; inducement, estoppel or otherwise. Any license under such intellectual +; property rights must be express and approved by Intel in writing. +; +; Unless otherwise agreed by Intel in writing, you may not remove or alter this +; notice or any other notice embedded in Materials by Intel or Intel's +; suppliers or licensors in any way. +; +; +; If this software was obtained under the Apache License, Version 2.0 (the +; "License"), the following terms apply: +; +; You may not use this file except in compliance with the License. 
You may +; obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +; +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; +; See the License for the specific language governing permissions and +; limitations under the License. +;=============================================================================== + +; Last version 04.12.04 for ML64 Version 8.00.41111 (PNI+x87+MMX(TM) technology supported) +; +; Bug for PMULUDQ fixed with MACRO substitution +; +; The latest version from 25.04.07: Kobby' mni macro substituted with IPP +; realization (because of erroneous REX byte for addressing with sib byte, +; high gpr set and scaling==1 - for instance [r8+r9]) also SNI support +; added. +; +; 26.01.2009 - USES_XMM_AVX & REST_XMM_AVX added - 'v' prefix instructions +; are used now for save/restore XMM and YMM registers + automatic "vzeroupper" +; in REST_XMM_AVX macro +; +; 14.12.2009 - FMA macro added for AVX2.0 (ml10.0 support only) +; 10.02.2012 - the "f" declared local in USES_GPR, RES_GPR and IFSAME_XMM macros to avoid possible conflict +; 01.06.2012 - AVX2 vpsllvd/vq variable shifts added FMA macro fixed for ymm10-15 support +; 11.03.2013 - BDW adcx/adox added +.XLIST +;.LISTALL +;.LIST +;.LISTIF +;.LISTMACROALL + +include asmdefs.inc + +CurVer TEXTEQU @Version +IF @Version GT 900 + D_ML900 equ 1 +ELSE + ymmword equ oword +ENDIF + +IF @Version GE 1100 + ML1100 equ 1 +ENDIF + +IF @Version GE 1200 + ML1200 equ 1 +ENDIF + +IF @Version GE 1400 + ML1400 equ 1 +ENDIF + +IFNDEF LINUX32E + IFNDEF WIN32E + .ERR <Platform is not defined { LINUX32E or WIN32E }> + ECHO LINUX32E or WIN32E - Linux ABI (parameter passing in rdi, rsi, rdx, rcx, r8, r9...) 
+ END + ENDIF +ENDIF + +IFDEF LINUX32E + IFDEF STACK_ABI + IPP_ABI = 2 + ELSE + IPP_ABI = 3 + ENDIF +ENDIF + +IFDEF WIN32E + IFDEF STACK_ABI + IPP_ABI = 1 + ELSE + IPP_ABI = 0 + ENDIF +ENDIF + +IPPASM macro x:req, y:VARARG + IFDEF _OWN_MERGED_BLD + IF _IPP32E EQ _IPP32E_PX + @CatStr(<mx_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_M7 + @CatStr(<m7_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_U8 + @CatStr(<u8_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_N8 + @CatStr(<n8_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_Y8 + @CatStr(<y8_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_E9 + @CatStr(<e9_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_L9 + @CatStr(<l9_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_N0 + @CatStr(<n0_>, <x>) + ENDIF + IF _IPP32E EQ _IPP32E_K0 + @CatStr(<k0_>, <x>) + ENDIF + ELSE + @CatStr(<>, <x>) + ENDIF +endm + +DEFINED MACRO symbol:REQ + IFDEF symbol + EXITM <-1> + ELSE + EXITM <0> + ENDIF +ENDM + + +IFSAME_GPR MACRO x, f + LOCAL y + FOR y,<rbx,rbp,r12,r13,r14,r15,RBX,RBP,R12,R13,R14,R15> + IFIDN <y>,<x> + f = 1 + EXITM + ENDIF + ENDM + IF IPP_ABI LT 2 + FOR y,<rsi,RSI,rdi,RDI> + IFIDN <y>,<x> + f = 1 + EXITM + ENDIF + ENDM + ENDIF +ENDM + +GPR_CUR textequ <> +XMM_CUR textequ <> + +@ArgRev MACRO arglist:vararg + LOCAL txt, arg + txt TEXTEQU <> + FOR arg, <arglist> + txt CATSTR <arg>, <!,>, txt + ENDM + IF @SizeStr( %txt ) GT 0 + txt SUBSTR txt, 1, @SizeStr( %txt ) - 1 + ENDIF + txt CATSTR <!<>, txt, <!>> + EXITM txt +ENDM + +USES_GPR MACRO z:vararg + LOCAL y, f + LOCAL_FRAME = 0 + GPR_FRAME = 0 + GPR_CUR textequ @ArgRev( z ) + f = 0 + FOR y,<z> + IFSAME_GPR y,f + IF f GT 0 + GPR_FRAME = GPR_FRAME + 8 + push y + .PUSHREG y + f = 0 + ENDIF + ENDM +ENDM + +REST_GPR MACRO z:vararg + LOCAL u, f + f = 0 + %FOR u, GPR_CUR + IFSAME_GPR u,f + IF f GT 0 + pop u + f = 0 + ENDIF + ENDM +ENDM + +IFSAME_XMM MACRO x, isFound + LOCAL y, isFound + isFound = 0 + FOR y,<xmm6,XMM6,xmm7,XMM7,xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15> + IFIDN <y>,<x> + 
isFound = 1 + EXITM + ENDIF + ENDM +ENDM + +USES_XMM MACRO z:vararg + LOCAL y + XMM_CUR TEXTEQU <> + S_FRAME = 0 + LOCAL_FRAME = ( LOCAL_FRAME + 15 ) AND (-16) + IF IPP_ABI LT 2 + T_FRAME = 0 + FOR y,<z> + IFSAME_XMM y, isFound + IF isFound GT 0 + XMM_CUR CATSTR <y>, <!,>, XMM_CUR + T_FRAME = T_FRAME + 16 + ENDIF + ENDM + IF @SizeStr( %XMM_CUR ) GT 0 + XMM_CUR SUBSTR XMM_CUR, 1, @SizeStr( %XMM_CUR ) - 1 + ENDIF + XMM_CUR CATSTR <!<>, XMM_CUR, <!>> + IF (( T_FRAME GT 0 ) OR ( LOCAL_FRAME GT 0 )) + S_FRAME = T_FRAME + LOCAL_FRAME + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + ENDIF + IF S_FRAME GT 0 + sub rsp,S_FRAME + .ALLOCSTACK S_FRAME + T_FRAME = LOCAL_FRAME + %FOR y, XMM_CUR + IFSAME_XMM y, isFound + IF isFound GT 0 + movdqa [rsp+T_FRAME],y + .SAVEXMM128 y,T_FRAME + T_FRAME = T_FRAME + 16 + ENDIF + ENDM + ENDIF + ELSE + IF IPP_ABI EQ 2 + S_FRAME = 48 + LOCAL_FRAME ;; 48 = 6 * 8 - stack frame for 6 register inputs + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + INP_FRAME = S_FRAME - 48 ;; for Linux32s-key stack-frame for 6 registers inputs... 
+ ELSE + IF LOCAL_FRAME GT 0 + S_FRAME = LOCAL_FRAME + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + ENDIF + ENDIF + IF S_FRAME GT 0 + sub rsp,S_FRAME + ENDIF + ENDIF +ENDM + +REST_XMM MACRO z:vararg + LOCAL y + IF IPP_ABI LT 2 + IF S_FRAME GT 0 + T_FRAME = LOCAL_FRAME + %FOR y, XMM_CUR + movdqa y,[rsp+T_FRAME] + T_FRAME = T_FRAME + 16 + ENDM + add rsp,S_FRAME + ENDIF + ELSE + IF S_FRAME GT 0 + add rsp,S_FRAME + ENDIF + ENDIF + IF _IPP32E GE _IPP32E_E9 + IF _IPP32E NE _IPP32E_N0 + vzeroupper + ENDIF + ENDIF +ENDM + + +SAVE_XMM textequ <!<xmm6,XMM6,xmm7,XMM7,xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15!>> +SAVE_YMM textequ <!<ymm6,YMM6,ymm7,YMM7,ymm8,YMM8,ymm9,YMM9,ymm10,YMM10,ymm11,YMM11,ymm12,YMM12,ymm13,YMM13,ymm14,YMM14,ymm15,YMM15!>> + +IS_SAVEX MACRO x, f + f = 0 + %FOR yrex,SAVE_XMM ; if xmm from 6-15 range and Windows - must be saved + IFIDN <yrex>,<x> + f = 1 + EXITM + ENDIF + ENDM +ENDM + +IS_SAVEY MACRO x, f + f = 0 + %FOR yrex,SAVE_YMM ; if xmm from 6-15 range and Windows - must be saved + IFIDN <yrex>,<x> + f = 1 + EXITM + ENDIF + ENDM +ENDM + +USES_XMM_AVX MACRO z:vararg + LOCAL y, f + XMM_CUR TEXTEQU <> + S_FRAME = 0 + LOCAL_FRAME = ( LOCAL_FRAME + 15 ) AND (-16) + IF IPP_ABI LT 2 + T_FRAME = 0 + FOR y,<z> + IS_SAVEX y, f + IF f GT 0 + XMM_CUR CATSTR <y>, <!,>, XMM_CUR + T_FRAME = T_FRAME + 16 + ENDIF + ENDM + FOR y,<z> + IS_SAVEY y, f + IF f GT 0 + XMM_CUR CATSTR <y>, <!,>, XMM_CUR + T_FRAME = T_FRAME + 32 + ENDIF + ENDM + IF @SizeStr( %XMM_CUR ) GT 0 + XMM_CUR SUBSTR XMM_CUR, 1, @SizeStr( %XMM_CUR ) - 1 + ENDIF + XMM_CUR CATSTR <!<>, XMM_CUR, <!>> + IF (( T_FRAME GT 0 ) OR ( LOCAL_FRAME GT 0 )) + S_FRAME = T_FRAME + LOCAL_FRAME + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + ENDIF + IF S_FRAME GT 0 + sub rsp,S_FRAME + .ALLOCSTACK S_FRAME + T_FRAME = LOCAL_FRAME + %FOR y, XMM_CUR + IS_SAVEX y, f + IF f GT 0 + vmovdqa oword ptr [rsp+T_FRAME],y + 
.SAVEXMM128 y,T_FRAME + T_FRAME = T_FRAME + 16 + ENDIF + ENDM + %FOR y, XMM_CUR + IS_SAVEY y, f + IF f GT 0 + vmovdqu ymmword ptr [rsp+T_FRAME], y + T_FRAME = T_FRAME + 32 +; vextractf128 oword ptr [rsp+T_FRAME],y,0 +; .SAVEXMM128 y,T_FRAME +; T_FRAME = T_FRAME + 16 +; vextractf128 oword ptr [rsp+T_FRAME],y,1 +; .SAVEXMM128 y,T_FRAME +; T_FRAME = T_FRAME + 16 + ENDIF + ENDM + ENDIF + ELSE + IF IPP_ABI EQ 2 + S_FRAME = 48 + LOCAL_FRAME ;; 48 = 6 * 8 - stack frame for 6 register inputs + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + INP_FRAME = S_FRAME - 48 ;; for Linux32s-key stack-frame for 6 registers inputs... + ELSE + IF LOCAL_FRAME GT 0 + S_FRAME = LOCAL_FRAME + IF (( S_FRAME + GPR_FRAME ) AND 8 ) EQ 0 + S_FRAME = S_FRAME + 8 + ENDIF + ENDIF + ENDIF + IF S_FRAME GT 0 + sub rsp,S_FRAME + ENDIF + ENDIF +ENDM + +REST_XMM_AVX MACRO z:vararg + LOCAL y, f + IF IPP_ABI LT 2 + IF S_FRAME GT 0 + T_FRAME = LOCAL_FRAME + %FOR y, XMM_CUR + IS_SAVEX y, f + IF f GT 0 + vmovdqa y, oword ptr [rsp+T_FRAME] + T_FRAME = T_FRAME + 16 + ENDIF + ENDM + %FOR y, XMM_CUR + IS_SAVEY y, f + IF f GT 0 + vmovdqu y, ymmword ptr [rsp+T_FRAME] + T_FRAME = T_FRAME + 32 +; vinsertf128 y,y,oword ptr [rsp+T_FRAME],0 +; T_FRAME = T_FRAME + 16 +; vinsertf128 y,y,oword ptr [rsp+T_FRAME],1 +; T_FRAME = T_FRAME + 16 + ENDIF + ENDM + add rsp,S_FRAME + ENDIF + ELSE + IF S_FRAME GT 0 + add rsp,S_FRAME + ENDIF + ENDIF + IF _IPP32E NE _IPP32E_N0 + vzeroupper + ENDIF +ENDM + +COMP_ABI MACRO x + IF IPP_ABI EQ 0 ;; if defined win32e + IF x GT 0 + mov rdi,rcx ;; ARG_1 + ENDIF + IF x GT 1 + mov rsi,rdx ;; ARG_2 + ENDIF + IF x GT 2 + mov rdx,r8 ;; ARG_3 + ENDIF + IF x GT 3 + mov rcx,r9 ;; ARG_4 + ENDIF + IF x GT 4 + mov r8,[rsp+S_FRAME+GPR_FRAME+40] ;; ARG_5 + ENDIF + IF x GT 5 + mov r9,[rsp+S_FRAME+GPR_FRAME+48] ;; ARG_6 + ENDIF + IF x GT 6 + FIRST_P = S_FRAME+GPR_FRAME+56 ;; ARG_7 + ARG_7 = S_FRAME+GPR_FRAME+56 + ENDIF + ENDIF + IF IPP_ABI EQ 1 ;; if defined win32s + FIRST_P = 
S_FRAME+GPR_FRAME+8 + IF x GT 0 + mov [rsp+FIRST_P],rcx + ARG_1 = FIRST_P + ENDIF + IF x GT 1 + mov [rsp+FIRST_P+8],rdx + ARG_2 = ARG_1+8 + ENDIF + IF x GT 2 + mov [rsp+FIRST_P+16],r8 + ARG_3 = ARG_2+8 + ENDIF + IF x GT 3 + mov [rsp+FIRST_P+24],r9 + ARG_4 = ARG_3+8 + ENDIF + IF x GT 4 + ARG_5 = ARG_4+8 + ENDIF + IF x GT 5 + ARG_6 = ARG_5+8 + ENDIF + IF x GT 6 + ARG_7 = ARG_6+8 ;; ARG_7 + ENDIF + ENDIF + IF IPP_ABI EQ 2 ;; if defined linux32s + FIRST_P = INP_FRAME + IF x GT 0 + mov [rsp+FIRST_P],rdi + ARG_1 = FIRST_P + ENDIF + IF x GT 1 + mov [rsp+FIRST_P+8],rsi + ARG_2 = ARG_1+8 + ENDIF + IF x GT 2 + mov [rsp+FIRST_P+16],rdx + ARG_3 = ARG_2+8 + ENDIF + IF x GT 3 + mov [rsp+FIRST_P+24],rcx + ARG_4 = ARG_3+8 + ENDIF + IF x GT 4 + mov [rsp+FIRST_P+32],r8 + ARG_5 = ARG_4+8 + ENDIF + IF x GT 5 + mov [rsp+FIRST_P+40],r9 + ARG_6 = ARG_5+8 + ENDIF + IF x GT 6 + ARG_7 = S_FRAME+GPR_FRAME+8 + ENDIF + ENDIF + IF IPP_ABI EQ 3 + IF x GT 6 ;; ARG_1 = rdi ARG_2 = rsi ARG_3 = rdx ARG_4 = rcx ARG_5 = r8 ARG_6 = r9 + FIRST_P = S_FRAME+GPR_FRAME+8 ;; ARG_7 + ARG_7 = S_FRAME+GPR_FRAME+8 + ENDIF + ENDIF + IF x GT 7 + ARG_8 = ARG_7+8 ;; ARG_8 + ENDIF + IF x GT 8 + ARG_9 = ARG_8+8 ;; ARG_9 + ENDIF + IF x GT 9 + ARG_10 = ARG_9+8 ;; ARG_10 + ENDIF + IF x GT 10 + ARG_11 = ARG_10+8 ;; ARG_11 + ENDIF + IF x GT 11 + ARG_12 = ARG_11+8 ;; ARG_12 + ENDIF + IF x GT 12 + ARG_13 = ARG_12+8 ;; ARG_13 + ENDIF + IF x GT 13 + ARG_14 = ARG_13+8 ;; ARG_14 + ENDIF + IF x GT 14 + ARG_15 = ARG_14+8 ;; ARG_15 + ENDIF + IF x GT 15 + ARG_16 = ARG_15+8 ;; ARG_16 + ENDIF + IF x GT 16 + ARG_17 = ARG_16+8 ;; ARG_17 + ENDIF + IF x GT 17 + ARG_18 = ARG_17+8 ;; ARG_18 + ENDIF +; IF IPP_ABI LT 2 ;; Windows + .ENDPROLOG +; ENDIF +ENDM + +; MNI (TNI) SNI (SSE4.1) STTNI (SSE4.2) + +IF DEFINED (LINUX32E) OR DEFINED (_YASM) ; MNI macro for Linux or for Windows + + sha1rnds4 MACRO op1:req, op2:req, imm8:req + %ECHO @CatStr(<sha1rnds4 >, < op1,>, < op2,>, < imm8 >) + endm + sha1nexte MACRO op1:req, op2:req + %ECHO 
@CatStr(<sha1nexte >, < op1,>, < op2 >) + endm + sha1msg1 MACRO op1:req, op2:req + %ECHO @CatStr(<sha1msg1 >, < op1,>, < op2 >) + endm + sha1msg2 MACRO op1:req, op2:req + %ECHO @CatStr(<sha1msg2 >, < op1,>, < op2 >) + endm + sha256msg1 MACRO op1:req, op2:req + %ECHO @CatStr(<sha256msg1 >, < op1,>, < op2 >) + endm + sha256msg2 MACRO op1:req, op2:req + %ECHO @CatStr(<sha256msg2 >, < op1,>, < op2 >) + endm + sha256rnds2 MACRO op1:req, op2:req + %ECHO @CatStr(<sha256rnds2 >, < op1,>, < op2 >) + endm + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MNI ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + IFNDEF ML1200 + adcx macro x:req, z:req + %ECHO @CatStr(<adcx >, < x,>, < z >) + endm + adox macro x:req, z:req + %ECHO @CatStr(<adox >, < x,>, < z >) + endm + + IFNDEF ML1100 + IFNDEF D_ML900 + + phaddw macro x:req, y:req + %ECHO @CatStr(<phaddw >, < x,>, < y >) + endm + phaddd macro x:req, y:req + %ECHO @CatStr(<phaddd >, < x,>, < y >) + endm + phaddsw macro x:req, y:req + %ECHO @CatStr(<phaddsw >, < x,>, < y >) + endm + phsubw macro x:req, y:req + %ECHO @CatStr(<phsubw >, < x,>, < y >) + endm + phsubd macro x:req, y:req + %ECHO @CatStr(<phsubd >, < x,>, < y >) + endm + phsubsw macro x:req, y:req + %ECHO @CatStr(<phsubsw >, < x,>, < y >) + endm + pmaddubsw macro x:req, y:req + %ECHO @CatStr(<pmaddubsw >, < x,>, < y >) + endm + pmulhrsw macro x:req, y:req + %ECHO @CatStr(<pmulhrsw >, < x,>, < y >) + endm + pshufb macro x:req, y:req + %ECHO @CatStr(<pshufb >, < x,>, < y >) + endm + psignb macro x:req, y:req + %ECHO @CatStr(<psignb >, < x,>, < y >) + endm + psignw macro x:req, y:req + %ECHO @CatStr(<psignw >, < x,>, < y >) + endm + psignd macro x:req, y:req + %ECHO @CatStr(<psignd >, < x,>, < y >) + endm + palignr macro x:req, y:req, z:req + %ECHO @CatStr(<palignr >, < x,>, < y,>, < z >) + endm + pabsb macro x:req, y:req + %ECHO @CatStr(<pabsb >, < x,>, < y >) + endm + pabsw macro x:req, y:req + %ECHO @CatStr(<pabsw >, < x,>, < y >) + endm + pabsd macro x:req, y:req + %ECHO @CatStr(<pabsd >, < x,>, 
< y >) + endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; SNI ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + blendpd macro x:req, y:req, z:req + %ECHO @CatStr(<blendpd >, < x,>, < y,>, < z >) + endm + blendps macro x:req, y:req, z:req + %ECHO @CatStr(<blendps >, < x,>, < y,>, < z >) + endm + blendvpd macro x:req, y:req, z + %ECHO @CatStr(<blendvpd >, < x,>, < y>) + endm + blendvps macro x:req, y:req, z + %ECHO @CatStr(<blendvps >, < x,>, < y>) + endm + dppd macro x:req, y:req, z:req + %ECHO @CatStr(<dppd >, < x,>, < y,>, < z >) + endm + dpps macro x:req, y:req, z:req + %ECHO @CatStr(<dpps >, < x,>, < y,>, < z >) + endm + extractps macro x:req, y:req, z:req + %ECHO @CatStr(<extractps >, < x,>, < y,>, < z >) + endm + insertps macro x:req, y:req, z:req + %ECHO @CatStr(<insertps >, < x,>, < y,>, < z >) + endm + movntdqa macro x:req, y:req + %ECHO @CatStr(<movntdqa >, < x,>, < y>) + endm + mpsadbw macro x:req, y:req, z:req + %ECHO @CatStr(<mpsadbw >, < x,>, < y,>, < z >) + endm + packusdw macro x:req, y:req + %ECHO @CatStr(<packusdw >, < x,>, < y>) + endm + pblendvb macro x:req, y:req, z + %ECHO @CatStr(<pblendvb >, < x,>, < y>) + endm + pblendw macro x:req, y:req, z:req + %ECHO @CatStr(<pblendw >, < x,>, < y,>, < z >) + endm + pcmpeqq macro x:req, y:req + %ECHO @CatStr(<pcmpeqq >, < x,>, < y>) + endm + pextrb macro x:req, y:req, z:req + %ECHO @CatStr(<pextrb >, < x,>, < y,>, < z >) + endm + pextrd macro x:req, y:req, z:req + %ECHO @CatStr(<pextrd >, < x,>, < y,>, < z >) + endm + pextrq macro x:req, y:req, z:req + %ECHO @CatStr(<pextrq >, < x,>, < y,>, < z >) + endm +IF _IPP32E GE _IPP32E_Y8 + OPTION NOKEYWORD:<pextrw> + pextrw macro x:req, y:req, z:req + %ECHO @CatStr(<pextrw >, < x,>, < y,>, < z >) + endm +ENDIF + phminposuw macro x:req, y:req + %ECHO @CatStr(<phminposuw >, < x,>, < y>) + endm + pinsrb macro x:req, y:req, z:req + %ECHO @CatStr(<pinsrb >, < x,>, < y,>, < z >) + endm + pinsrd macro x:req, y:req, z:req + %ECHO @CatStr(<pinsrd >, < x,>, < y,>, < z >) + endm + pinsrq macro 
x:req, y:req, z:req + %ECHO @CatStr(<pinsrq >, < x,>, < y,>, < z >) + endm + pmaxsb macro x:req, y:req + %ECHO @CatStr(<pmaxsb >, < x,>, < y>) + endm + pmaxsd macro x:req, y:req + %ECHO @CatStr(<pmaxsd >, < x,>, < y>) + endm + pmaxud macro x:req, y:req + %ECHO @CatStr(<pmaxud >, < x,>, < y>) + endm + pmaxuw macro x:req, y:req + %ECHO @CatStr(<pmaxuw >, < x,>, < y>) + endm + pminsb macro x:req, y:req + %ECHO @CatStr(<pminsb >, < x,>, < y>) + endm + pminsd macro x:req, y:req + %ECHO @CatStr(<pminsd >, < x,>, < y>) + endm + pminud macro x:req, y:req + %ECHO @CatStr(<pminud >, < x,>, < y>) + endm + pminuw macro x:req, y:req + %ECHO @CatStr(<pminuw >, < x,>, < y>) + endm + pmovsxbw macro x:req, y:req + %ECHO @CatStr(<pmovsxbw >, < x,>, < y>) + endm + pmovsxbd macro x:req, y:req + %ECHO @CatStr(<pmovsxbd >, < x,>, < y>) + endm + pmovsxbq macro x:req, y:req + %ECHO @CatStr(<pmovsxbq >, < x,>, < y>) + endm + pmovsxwd macro x:req, y:req + %ECHO @CatStr(<pmovsxwd >, < x,>, < y>) + endm + pmovsxwq macro x:req, y:req + %ECHO @CatStr(<pmovsxwq >, < x,>, < y>) + endm + pmovsxdq macro x:req, y:req + %ECHO @CatStr(<pmovsxdq >, < x,>, < y>) + endm + pmovzxbw macro x:req, y:req + %ECHO @CatStr(<pmovzxbw >, < x,>, < y>) + endm + pmovzxbd macro x:req, y:req + %ECHO @CatStr(<pmovzxbd >, < x,>, < y>) + endm + pmovzxbq macro x:req, y:req + %ECHO @CatStr(<pmovzxbq >, < x,>, < y>) + endm + pmovzxwd macro x:req, y:req + %ECHO @CatStr(<pmovzxwd >, < x,>, < y>) + endm + pmovzxwq macro x:req, y:req + %ECHO @CatStr(<pmovzxwq >, < x,>, < y>) + endm + pmovzxdq macro x:req, y:req + %ECHO @CatStr(<pmovzxdq >, < x,>, < y>) + endm + pmuldq macro x:req, y:req + %ECHO @CatStr(<pmuldq >, < x,>, < y>) + endm + pmulld macro x:req, y:req + %ECHO @CatStr(<pmulld >, < x,>, < y>) + endm + ptest macro x:req, y:req + %ECHO @CatStr(<ptest >, < x,>, < y>) + endm + roundpd macro x:req, y:req, z:req + %ECHO @CatStr(<roundpd >, < x,>, < y,>, < z >) + endm + roundps macro x:req, y:req, z:req + %ECHO @CatStr(<roundps 
>, < x,>, < y,>, < z >) + endm + roundsd macro x:req, y:req, z:req + %ECHO @CatStr(<roundsd >, < x,>, < y,>, < z >) + endm + roundss macro x:req, y:req, z:req + %ECHO @CatStr(<roundss >, < x,>, < y,>, < z >) + endm +; SSE4.2 + pcmpestri macro x:req, y:req, z:req + %ECHO @CatStr(<pcmpestri >, < x,>, < y,>, < z >) + endm + pcmpestrm macro x:req, y:req, z:req + %ECHO @CatStr(<pcmpestrm >, < x,>, < y,>, < z >) + endm + pcmpistri macro x:req, y:req, z:req + %ECHO @CatStr(<pcmpistri >, < x,>, < y,>, < z >) + endm + pcmpistrm macro x:req, y:req, z:req + %ECHO @CatStr(<pcmpistrm >, < x,>, < y,>, < z >) + endm + pcmpgtq macro x:req, y:req + %ECHO @CatStr(<pcmpgtq >, < x,>, < y>) + endm + crc32 macro x:req, y:req + %ECHO @CatStr(<crc32 >, < x,>, < y>) + endm +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; WSM ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +aesenc macro x:req, y:req + %ECHO @CatStr(<aesenc >, < x,>, < y>) + endm +aesenclast macro x:req, y:req + %ECHO @CatStr(<aesenclast >, < x,>, < y>) + endm +aesdec macro x:req, y:req + %ECHO @CatStr(<aesdec >, < x,>, < y>) + endm +aesdeclast macro x:req, y:req + %ECHO @CatStr(<aesdeclast >, < x,>, < y>) + endm +aesimc macro x:req, y:req + %ECHO @CatStr(<aesimc >, < x,>, < y>) + endm +aeskeygenassist macro x:req, y:req, z:req + %ECHO @CatStr(<aeskeygenassist >, < x,>, < y,>, < z >) + endm +pclmulqdq macro x:req, y:req, z:req + %ECHO @CatStr(<pclmulqdq >, < x,>, < y,>, < z >) + endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; AVX ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +vaesenc macro x:req, y:req, z:req + %ECHO @CatStr(<vaesenc >, < x,>, < y,>, < z >) + endm +vaesenclast macro x:req, y:req, z:req + %ECHO @CatStr(<vaesenclast >, < x,>, < y,>, < z >) + endm +vaesdec macro x:req, y:req, z:req + %ECHO @CatStr(<vaesdec >, < x,>, < y,>, < z >) + endm +vaesdeclast macro x:req, y:req, z:req + %ECHO @CatStr(<vaesdeclast >, < x,>, < y,>, < z >) + endm +vaesimc macro x:req, y:req + %ECHO @CatStr(<vaesimc >, < x,>, < y>) + endm +vaeskeygenassist macro x:req, y:req, z:req + 
%ECHO @CatStr(<vaeskeygenassist >, < x,>, < y,>, < z >) + endm +vaddpd macro x:req, y:req, z:req + %ECHO @CatStr(<vaddpd >, < x,>, < y,>, < z >) + endm +vaddps macro x:req, y:req, z:req + %ECHO @CatStr(<vaddps >, < x,>, < y,>, < z >) + endm +vaddsd macro x:req, y:req, z:req + %ECHO @CatStr(<vaddsd >, < x,>, < y,>, < z >) + endm +vaddss macro x:req, y:req, z:req + %ECHO @CatStr(<vaddss >, < x,>, < y,>, < z >) + endm +vaddsubpd macro x:req, y:req, z:req + %ECHO @CatStr(<vaddsubpd >, < x,>, < y,>, < z >) + endm +vaddsubps macro x:req, y:req, z:req + %ECHO @CatStr(<vaddsubps >, < x,>, < y,>, < z >) + endm +vandpd macro x:req, y:req, z:req + %ECHO @CatStr(<vandpd >, < x,>, < y,>, < z >) + endm +vandps macro x:req, y:req, z:req + %ECHO @CatStr(<vandps >, < x,>, < y,>, < z >) + endm +vandnpd macro x:req, y:req, z:req + %ECHO @CatStr(<vandnpd >, < x,>, < y,>, < z >) + endm +vandnps macro x:req, y:req, z:req + %ECHO @CatStr(<vandnps >, < x,>, < y,>, < z >) + endm +vblendpd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vblendpd >, < x,>, < y,>, < z,>, < imm>) + endm +vblendps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vblendps >, < x,>, < y,>, < z,>, < imm>) + endm +vblendvpd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vblendvpd >, < x,>, < y,>, < z,>, < imm>) + endm +vblendvps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vblendvps >, < x,>, < y,>, < z,>, < imm>) + endm +vbroadcastss macro x:req, y:req + %ECHO @CatStr(<vbroadcastss >, < x,>, < y>) + endm +vbroadcastsd macro x:req, y:req + %ECHO @CatStr(<vbroadcastsd >, < x,>, < y>) + endm +vbroadcastf128 macro x:req, y:req + %ECHO @CatStr(<vbroadcastf128 >, < x,>, < y>) + endm +vcmpeqpd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpeqpd >, < x,>, < y,>, < z >) + endm +vcmpltpd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpltpd >, < x,>, < y,>, < z >) + endm +vcmplepd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmplepd >, < x,>, < y,>, < z >) + endm +vcmpunordpd macro x:req, y:req, z:req + 
%ECHO @CatStr(<vcmpunordpd >, < x,>, < y,>, < z >) + endm +vcmpneqpd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpneqpd >, < x,>, < y,>, < z >) + endm +vcmpnltpd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnltpd >, < x,>, < y,>, < z >) + endm +vcmpnlepd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnlepd >, < x,>, < y,>, < z >) + endm +vcmpordpd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpordpd >, < x,>, < y,>, < z >) + endm +vcmppd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vcmppd >, < x,>, < y,>, < z,>, < imm>) + endm +vcmpps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vcmpps >, < x,>, < y,>, < z,>, < imm>) + endm +vcmpsd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vcmpsd >, < x,>, < y,>, < z,>, < imm>) + endm +vcmpeqps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpeqps >, < x,>, < y,>, < z >) + endm +vcmpltps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpltps >, < x,>, < y,>, < z >) + endm +vcmpleps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpleps >, < x,>, < y,>, < z >) + endm +vcmpunordps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpunordps >, < x,>, < y,>, < z >) + endm +vcmpneqps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpneqps >, < x,>, < y,>, < z >) + endm +vcmpnltps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnltps >, < x,>, < y,>, < z >) + endm +vcmpnleps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnleps >, < x,>, < y,>, < z >) + endm +vcmpordps macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpordps >, < x,>, < y,>, < z >) + endm +vcmpeqsd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpeqsd >, < x,>, < y,>, < z >) + endm +vcmpltsd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpltsd >, < x,>, < y,>, < z >) + endm +vcmplesd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmplesd >, < x,>, < y,>, < z >) + endm +vcmpunordsd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpunordsd >, < x,>, < y,>, < z >) + endm +vcmpneqsd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpneqsd >, < x,>, < y,>, < z >) + endm +vcmpnltsd 
macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnltsd >, < x,>, < y,>, < z >) + endm +vcmpnlesd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnlesd >, < x,>, < y,>, < z >) + endm +vcmpordsd macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpordsd >, < x,>, < y,>, < z >) + endm +vcmpss macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vcmpss >, < x,>, < y,>, < z,>, < imm>) + endm +vcmpeqss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpeqss >, < x,>, < y,>, < z >) + endm +vcmpltss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpltss >, < x,>, < y,>, < z >) + endm +vcmpless macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpless >, < x,>, < y,>, < z >) + endm +vcmpunordss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpunordss >, < x,>, < y,>, < z >) + endm +vcmpneqss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpneqss >, < x,>, < y,>, < z >) + endm +vcmpnltss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnltss >, < x,>, < y,>, < z >) + endm +vcmpnless macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpnless >, < x,>, < y,>, < z >) + endm +vcmpordss macro x:req, y:req, z:req + %ECHO @CatStr(<vcmpordss >, < x,>, < y,>, < z >) + endm +vcomisd macro x:req, y:req + %ECHO @CatStr(<vcomisd >, < x,>, < y>) + endm +vcomiss macro x:req, y:req + %ECHO @CatStr(<vcomiss >, < x,>, < y>) + endm +vcvtdq2pd macro x:req, y:req + %ECHO @CatStr(<vcvtdq2pd >, < x,>, < y>) + endm +vcvtdq2ps macro x:req, y:req + %ECHO @CatStr(<vcvtdq2ps >, < x,>, < y>) + endm +vcvtpd2dq macro x:req, y:req + %ECHO @CatStr(<vcvtpd2dq >, < x,>, < y>) + endm +vcvtpd2ps macro x:req, y:req + %ECHO @CatStr(<vcvtpd2ps >, < x,>, < y>) + endm +vcvtps2dq macro x:req, y:req + %ECHO @CatStr(<vcvtps2dq >, < x,>, < y>) + endm +vcvtps2pd macro x:req, y:req + %ECHO @CatStr(<vcvtps2pd >, < x,>, < y>) + endm +vcvtsd2si macro x:req, y:req + %ECHO @CatStr(<vcvtsd2si >, < x,>, < y>) + endm +vcvtsd2ss macro x:req, y:req, z:req + %ECHO @CatStr(<vcvtsd2ss >, < x,>, < y,>, < z>) + endm +vcvtsi2sd macro x:req, y:req, z:req + %ECHO 
@CatStr(<vcvtsi2sd >, < x,>, < y,>, < z>) + endm +vcvtsi2ss macro x:req, y:req, z:req + %ECHO @CatStr(<vcvtsi2ss >, < x,>, < y,>, < z>) + endm +vcvtss2sd macro x:req, y:req, z:req + %ECHO @CatStr(<vcvtss2sd >, < x,>, < y,>, < z>) + endm +vcvtss2si macro x:req, y:req + %ECHO @CatStr(<vcvtss2si >, < x,>, < y>) + endm +vcvttpd2dq macro x:req, y:req + %ECHO @CatStr(<vcvttpd2dq >, < x,>, < y>) + endm +vcvttps2dq macro x:req, y:req + %ECHO @CatStr(<vcvttps2dq >, < x,>, < y>) + endm +vcvttsd2si macro x:req, y:req + %ECHO @CatStr(<vcvttsd2si >, < x,>, < y>) + endm +vcvttss2si macro x:req, y:req + %ECHO @CatStr(<vcvttss2si >, < x,>, < y>) + endm +vdivpd macro x:req, y:req, z:req + %ECHO @CatStr(<vdivpd >, < x,>, < y,>, < z >) + endm +vdivps macro x:req, y:req, z:req + %ECHO @CatStr(<vdivps >, < x,>, < y,>, < z >) + endm +vdivsd macro x:req, y:req, z:req + %ECHO @CatStr(<vdivsd >, < x,>, < y,>, < z >) + endm +vdivss macro x:req, y:req, z:req + %ECHO @CatStr(<vdivss >, < x,>, < y,>, < z >) + endm +vdppd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vdppd >, < x,>, < y,>, < z,>, < imm>) + endm +vdpps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vdpps >, < x,>, < y,>, < z,>, < imm>) + endm +vextractf128 macro x:req, y:req, z:req + %ECHO @CatStr(<vextractf128 >, < x,>, < y,>, < z >) + endm +vextractps macro x:req, y:req, z:req + %ECHO @CatStr(<vextractps >, < x,>, < y,>, < z >) + endm +vhaddpd macro x:req, y:req, z:req + %ECHO @CatStr(<vhaddpd >, < x,>, < y,>, < z >) + endm +vhaddps macro x:req, y:req, z:req + %ECHO @CatStr(<vhaddps >, < x,>, < y,>, < z >) + endm +vhsubpd macro x:req, y:req, z:req + %ECHO @CatStr(<vhsubpd >, < x,>, < y,>, < z >) + endm +vhsubps macro x:req, y:req, z:req + %ECHO @CatStr(<vhsubps >, < x,>, < y,>, < z >) + endm +vinsertf128 macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vinsertf128 >, < x,>, < y,>, < z,>, < imm>) + endm +vinsertps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vinsertps >, < x,>, < y,>, < z,>, < imm>) + endm 
+; ---- AVX wrappers: vlddqu .. vmulsd ------------------------------------------
+; Each macro in this group only %ECHOes the textual form of the instruction it
+; is named after; the text is assembled with @CatStr from the macro operands.
+; Operands declared :req must be present at the call site.
+vlddqu macro x:req, y:req
+  %ECHO @CatStr(<vlddqu >, < x,>, < y>)
+  endm
+vldmxcsr macro x:req
+  %ECHO @CatStr(<vldmxcsr >, < x>)
+  endm
+vmaskmovdqu macro x:req, y:req
+  %ECHO @CatStr(<vmaskmovdqu >, < x,>, < y>)
+  endm
+vmaskmovpd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaskmovpd >, < x,>, < y,>, < z >)
+  endm
+vmaskmovps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaskmovps >, < x,>, < y,>, < z >)
+  endm
+vmaxpd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaxpd >, < x,>, < y,>, < z >)
+  endm
+vmaxps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaxps >, < x,>, < y,>, < z >)
+  endm
+vmaxsd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaxsd >, < x,>, < y,>, < z >)
+  endm
+vmaxss macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmaxss >, < x,>, < y,>, < z >)
+  endm
+vminpd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vminpd >, < x,>, < y,>, < z >)
+  endm
+vminps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vminps >, < x,>, < y,>, < z >)
+  endm
+vminsd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vminsd >, < x,>, < y,>, < z >)
+  endm
+vminss macro x:req, y:req, z:req
+  %ECHO @CatStr(<vminss >, < x,>, < y,>, < z >)
+  endm
+vmovapd macro x:req, y:req
+  %ECHO @CatStr(<vmovapd >, < x,>, < y>)
+  endm
+vmovaps macro x:req, y:req
+  %ECHO @CatStr(<vmovaps >, < x,>, < y>)
+  endm
+vmovd macro x:req, y:req
+  %ECHO @CatStr(<vmovd >, < x,>, < y>)
+  endm
+vmovddup macro x:req, y:req
+  %ECHO @CatStr(<vmovddup >, < x,>, < y>)
+  endm
+vmovdqa macro x:req, y:req
+  %ECHO @CatStr(<vmovdqa >, < x,>, < y>)
+  endm
+; vmovdqu - only two operands (x, y) are echoed.  The third parameter used to
+; be declared :req although it is never emitted, so an ordinary two-operand
+; call failed for a missing required argument.  It is kept as an optional,
+; ignored parameter so that existing three-argument calls still expand.
+vmovdqu macro x:req, y:req, z
+  %ECHO @CatStr(<vmovdqu >, < x,>, < y>)
+  endm
+vmovhlps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmovhlps >, < x,>, < y,>, < z>)
+  endm
+; vmovhpd / vmovhps - z is optional: the three-operand text is echoed when z is
+; supplied, the two-operand text otherwise.
+vmovhpd macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovhpd >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovhpd >, < x,>, < y>)
+  ENDIF
+endm
+vmovhps macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovhps >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovhps >, < x,>, < y>)
+  ENDIF
+endm
+vmovlhps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmovlhps >, < x,>, < y,>, < z>)
+  endm
+; vmovlpd / vmovlps - same optional-z handling as vmovhpd / vmovhps.
+vmovlpd macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovlpd >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovlpd >, < x,>, < y>)
+  ENDIF
+endm
+vmovlps macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovlps >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovlps >, < x,>, < y>)
+  ENDIF
+endm
+vmovmskpd macro x:req, y:req
+  %ECHO @CatStr(<vmovmskpd >, < x,>, < y>)
+  endm
+vmovmskps macro x:req, y:req
+  %ECHO @CatStr(<vmovmskps >, < x,>, < y>)
+  endm
+vmovntdq macro x:req, y:req
+  %ECHO @CatStr(<vmovntdq >, < x,>, < y>)
+  endm
+vmovntdqa macro x:req, y:req
+  %ECHO @CatStr(<vmovntdqa >, < x,>, < y>)
+  endm
+vmovntpd macro x:req, y:req
+  %ECHO @CatStr(<vmovntpd >, < x,>, < y>)
+  endm
+vmovntps macro x:req, y:req
+  %ECHO @CatStr(<vmovntps >, < x,>, < y>)
+  endm
+; NOTE(review): a VEX-encoded "vmovntq" does not appear to exist (movntq is an
+; MMX-register store) - confirm before relying on this wrapper.
+vmovntq macro x:req, y:req
+  %ECHO @CatStr(<vmovntq >, < x,>, < y>)
+  endm
+vmovq macro x:req, y:req
+  %ECHO @CatStr(<vmovq >, < x,>, < y>)
+  endm
+; vmovsd - optional z selects the three-operand or the two-operand text.
+vmovsd macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovsd >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovsd >, < x,>, < y>)
+  ENDIF
+endm
+vmovshdup macro x:req, y:req
+  %ECHO @CatStr(<vmovshdup >, < x,>, < y>)
+  endm
+vmovsldup macro x:req, y:req
+  %ECHO @CatStr(<vmovsldup >, < x,>, < y>)
+  endm
+; vmovss - optional z selects the three-operand or the two-operand text.
+vmovss macro x:req, y:req, z
+  IFNB <z>
+  %ECHO @CatStr(<vmovss >, < x,>, < y,>, < z>)
+  ELSE
+  %ECHO @CatStr(<vmovss >, < x,>, < y>)
+  ENDIF
+endm
+vmovupd macro x:req, y:req
+  %ECHO @CatStr(<vmovupd >, < x,>, < y>)
+  endm
+vmovups macro x:req, y:req
+  %ECHO @CatStr(<vmovups >, < x,>, < y>)
+  endm
+vmpsadbw macro x:req, y:req, z:req, imm:req
+  %ECHO @CatStr(<vmpsadbw >, < x,>, < y,>, < z,>, < imm>)
+  endm
+vmulpd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmulpd >, < x,>, < y,>, < z >)
+  endm
+vmulps macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmulps >, < x,>, < y,>, < z >)
+  endm
+vmulsd macro x:req, y:req, z:req
+  %ECHO @CatStr(<vmulsd >, < x,>, < y,>, < z >)
+  endm
+vmulss macro x:req, y:req, z:req + %ECHO @CatStr(<vmulss >, < x,>, < y,>, < z >) + endm +vorpd macro x:req, y:req, z:req + %ECHO @CatStr(<vorpd >, < x,>, < y,>, < z >) + endm +vorps macro x:req, y:req, z:req + %ECHO @CatStr(<vorps >, < x,>, < y,>, < z >) + endm + +vpabsb macro x:req, y:req + %ECHO @CatStr(<vpabsb >, < x,>, < y>) + endm +vpabsw macro x:req, y:req + %ECHO @CatStr(<vpabsw >, < x,>, < y>) + endm +vpabsd macro x:req, y:req + %ECHO @CatStr(<vpabsd >, < x,>, < y>) + endm +vpackssdw macro x:req, y:req, z:req + %ECHO @CatStr(<vpackssdw >, < x,>, < y,>, < z >) + endm +vpacksswb macro x:req, y:req, z:req + %ECHO @CatStr(<vpacksswb >, < x,>, < y,>, < z >) + endm +vpackuswb macro x:req, y:req, z:req + %ECHO @CatStr(<vpackuswb >, < x,>, < y,>, < z >) + endm +vpackusdw macro x:req, y:req, z:req + %ECHO @CatStr(<vpackusdw >, < x,>, < y,>, < z >) + endm +vpaddb macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddb >, < x,>, < y,>, < z >) + endm +vpaddd macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddd >, < x,>, < y,>, < z >) + endm +vpaddq macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddq >, < x,>, < y,>, < z >) + endm +vpaddsb macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddsb >, < x,>, < y,>, < z >) + endm +vpaddsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddsw >, < x,>, < y,>, < z >) + endm +vpaddusb macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddusb >, < x,>, < y,>, < z >) + endm +vpaddusw macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddusw >, < x,>, < y,>, < z >) + endm +vpaddw macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddw >, < x,>, < y,>, < z >) + endm +vpand macro x:req, y:req, z:req + %ECHO @CatStr(<vpand >, < x,>, < y,>, < z >) + endm +vpandn macro x:req, y:req, z:req + %ECHO @CatStr(<vpandn >, < x,>, < y,>, < z >) + endm +vpavgb macro x:req, y:req, z:req + %ECHO @CatStr(<vpavgb >, < x,>, < y,>, < z >) + endm +vpavgw macro x:req, y:req, z:req + %ECHO @CatStr(<vpavgw >, < x,>, < y,>, < z >) + endm +vpalignr macro x:req, y:req, z:req, imm:req + %ECHO 
@CatStr(<vpalignr >, < x,>, < y,>, < z,>, < imm>) + endm +vpblendvb macro x:req, y:req, z:req, q:req + %ECHO @CatStr(<vpblendvb >, < x,>, < y,>, < z,>, < q>) + endm +vpblendw macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vpblendw >, < x,>, < y,>, < z,>, < imm>) + endm +; vpclmulqdq: the instruction takes a trailing imm8 quadword selector; imm is optional here so that three-argument invocations keep echoing exactly as before +vpclmulqdq macro x:req, y:req, z:req, imm + IFNB <imm> + %ECHO @CatStr(<vpclmulqdq >, < x,>, < y,>, < z,>, < imm>) + ELSE + %ECHO @CatStr(<vpclmulqdq >, < x,>, < y,>, < z >) + ENDIF +endm +vpcmpestri macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpestri >, < x,>, < y,>, < z >) + endm +vpcmpestrm macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpestrm >, < x,>, < y,>, < z >) + endm +vpcmpistri macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpistri >, < x,>, < y,>, < z >) + endm +vpcmpistrm macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpistrm >, < x,>, < y,>, < z >) + endm +vpcmpeqb macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpeqb >, < x,>, < y,>, < z >) + endm +vpcmpeqd macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpeqd >, < x,>, < y,>, < z >) + endm +vpcmpeqw macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpeqw >, < x,>, < y,>, < z >) + endm +vpcmpeqq macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpeqq >, < x,>, < y,>, < z >) + endm +vpcmpgtb macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpgtb >, < x,>, < y,>, < z >) + endm +vpcmpgtd macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpgtd >, < x,>, < y,>, < z >) + endm +vpcmpgtw macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpgtw >, < x,>, < y,>, < z >) + endm +vpcmpgtq macro x:req, y:req, z:req + %ECHO @CatStr(<vpcmpgtq >, < x,>, < y,>, < z >) + endm +vpermilpd macro x:req, y:req, z:req + %ECHO @CatStr(<vpermilpd >, < x,>, < y,>, < z >) + endm +vpermil2pd macro x:req, y:req, z:req, v:req, imm:req + %ECHO @CatStr(<vpermil2pd >, < x,>, < y,>, < z,>, < v,>, < imm>) + endm +vpermilps macro x:req, y:req, z:req + %ECHO @CatStr(<vpermilps >, < x,>, < y,>, < z >) + endm +vpermil2ps macro x:req, y:req, z:req, v:req, imm:req + %ECHO @CatStr(<vpermil2ps >, < x,>, < y,>, < z,>, < v,>, < imm>) + endm +vperm2f128 macro x:req, y:req, z:req, imm:req + 
%ECHO @CatStr(<vperm2f128 >, < x,>, < y,>, < z,>, < imm>) + endm +vpextrb macro x:req, y:req, z:req + %ECHO @CatStr(<vpextrb >, < x,>, < y,>, < z >) + endm +vpextrd macro x:req, y:req, z:req + %ECHO @CatStr(<vpextrd >, < x,>, < y,>, < z >) + endm +vpextrq macro x:req, y:req, z:req + %ECHO @CatStr(<vpextrq >, < x,>, < y,>, < z >) + endm +vpextrw macro x:req, y:req, z:req + %ECHO @CatStr(<vpextrw >, < x,>, < y,>, < z >) + endm +vphaddw macro x:req, y:req, z:req + %ECHO @CatStr(<vphaddw >, < x,>, < y,>, < z >) + endm +vphaddd macro x:req, y:req, z:req + %ECHO @CatStr(<vphaddd >, < x,>, < y,>, < z >) + endm +vphaddsw macro x:req, y:req, z:req + %ECHO @CatStr(<vphaddsw >, < x,>, < y,>, < z >) + endm +vphminposuw macro x:req, y:req + %ECHO @CatStr(<vphminposuw >, < x,>, < y>) + endm +vphsubw macro x:req, y:req, z:req + %ECHO @CatStr(<vphsubw >, < x,>, < y,>, < z >) + endm +vphsubd macro x:req, y:req, z:req + %ECHO @CatStr(<vphsubd >, < x,>, < y,>, < z >) + endm +vphsubsw macro x:req, y:req, z:req + %ECHO @CatStr(<vphsubsw >, < x,>, < y,>, < z >) + endm +vpinsrb macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vpinsrb >, < x,>, < y,>, < z,>, < imm>) + endm +vpinsrd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vpinsrd >, < x,>, < y,>, < z,>, < imm>) + endm +vpinsrq macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vpinsrq >, < x,>, < y,>, < z,>, < imm>) + endm +vpinsrw macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vpinsrw >, < x,>, < y,>, < z,>, < imm>) + endm +vpmaddwd macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaddwd >, < x,>, < y,>, < z >) + endm +vpmaddubsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaddubsw >, < x,>, < y,>, < z >) + endm +vpmaxsb macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaxsb >, < x,>, < y,>, < z >) + endm +vpmaxsd macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaxsd >, < x,>, < y,>, < z >) + endm +vpmaxsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaxsw >, < x,>, < y,>, < z >) + endm +vpmaxub macro x:req, y:req, z:req + 
%ECHO @CatStr(<vpmaxub >, < x,>, < y,>, < z >) + endm +vpmaxud macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaxud >, < x,>, < y,>, < z >) + endm +vpmaxuw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaxuw >, < x,>, < y,>, < z >) + endm +vpminsb macro x:req, y:req, z:req + %ECHO @CatStr(<vpminsb >, < x,>, < y,>, < z >) + endm +vpminsd macro x:req, y:req, z:req + %ECHO @CatStr(<vpminsd >, < x,>, < y,>, < z >) + endm +vpminsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpminsw >, < x,>, < y,>, < z >) + endm +vpminub macro x:req, y:req, z:req + %ECHO @CatStr(<vpminub >, < x,>, < y,>, < z >) + endm +vpminud macro x:req, y:req, z:req + %ECHO @CatStr(<vpminud >, < x,>, < y,>, < z >) + endm +vpminuw macro x:req, y:req, z:req + %ECHO @CatStr(<vpminuw >, < x,>, < y,>, < z >) + endm +vpmovmskb macro x:req, y:req + %ECHO @CatStr(<vpmovmskb >, < x,>, < y>) + endm +vpmovsxbw macro x:req, y:req + %ECHO @CatStr(<vpmovsxbw >, < x,>, < y>) + endm +vpmovsxbd macro x:req, y:req + %ECHO @CatStr(<vpmovsxbd >, < x,>, < y>) + endm +vpmovsxbq macro x:req, y:req + %ECHO @CatStr(<vpmovsxbq >, < x,>, < y>) + endm +vpmovsxwd macro x:req, y:req + %ECHO @CatStr(<vpmovsxwd >, < x,>, < y>) + endm +vpmovsxwq macro x:req, y:req + %ECHO @CatStr(<vpmovsxwq >, < x,>, < y>) + endm +vpmovsxdq macro x:req, y:req + %ECHO @CatStr(<vpmovsxdq >, < x,>, < y>) + endm +vpmovzxbw macro x:req, y:req + %ECHO @CatStr(<vpmovzxbw >, < x,>, < y>) + endm +vpmovzxbd macro x:req, y:req + %ECHO @CatStr(<vpmovzxbd >, < x,>, < y>) + endm +vpmovzxbq macro x:req, y:req + %ECHO @CatStr(<vpmovzxbq >, < x,>, < y>) + endm +vpmovzxwd macro x:req, y:req + %ECHO @CatStr(<vpmovzxwd >, < x,>, < y>) + endm +vpmovzxwq macro x:req, y:req + %ECHO @CatStr(<vpmovzxwq >, < x,>, < y>) + endm +vpmovzxdq macro x:req, y:req + %ECHO @CatStr(<vpmovzxdq >, < x,>, < y>) + endm +vpmulhuw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmulhuw >, < x,>, < y,>, < z >) + endm +vpmulhrsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmulhrsw >, < x,>, < y,>, < z 
>) + endm +vpmulhw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmulhw >, < x,>, < y,>, < z >) + endm +vpmullw macro x:req, y:req, z:req + %ECHO @CatStr(<vpmullw >, < x,>, < y,>, < z >) + endm +vpmulld macro x:req, y:req, z:req + %ECHO @CatStr(<vpmulld >, < x,>, < y,>, < z >) + endm +vpmuludq macro x:req, y:req, z:req + %ECHO @CatStr(<vpmuludq >, < x,>, < y,>, < z >) + endm +vpmuldq macro x:req, y:req, z:req + %ECHO @CatStr(<vpmuldq >, < x,>, < y,>, < z >) + endm +vpor macro x:req, y:req, z:req + %ECHO @CatStr(<vpor >, < x,>, < y,>, < z >) + endm +vpsadbw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsadbw >, < x,>, < y,>, < z >) + endm +vpshufb macro x:req, y:req, z:req + %ECHO @CatStr(<vpshufb >, < x,>, < y,>, < z >) + endm +vpshufd macro x:req, y:req, z:req + %ECHO @CatStr(<vpshufd >, < x,>, < y,>, < z >) + endm +vpshufhw macro x:req, y:req, z:req + %ECHO @CatStr(<vpshufhw >, < x,>, < y,>, < z >) + endm +vpshuflw macro x:req, y:req, z:req + %ECHO @CatStr(<vpshuflw >, < x,>, < y,>, < z >) + endm +vpsignb macro x:req, y:req, z:req + %ECHO @CatStr(<vpsignb >, < x,>, < y,>, < z >) + endm +vpsignw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsignw >, < x,>, < y,>, < z >) + endm +vpsignd macro x:req, y:req, z:req + %ECHO @CatStr(<vpsignd >, < x,>, < y,>, < z >) + endm +vpslld macro x:req, y:req, z:req + %ECHO @CatStr(<vpslld >, < x,>, < y,>, < z >) + endm +vpslldq macro x:req, y:req, z:req + %ECHO @CatStr(<vpslldq >, < x,>, < y,>, < z >) + endm +vpsllq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsllq >, < x,>, < y,>, < z >) + endm +vpsllw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsllw >, < x,>, < y,>, < z >) + endm +vpsrad macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrad >, < x,>, < y,>, < z >) + endm +vpsraw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsraw >, < x,>, < y,>, < z >) + endm +vpsrld macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrld >, < x,>, < y,>, < z >) + endm +vpsrldq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrldq >, < x,>, < y,>, < z >) + endm 
+vpsrlq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrlq >, < x,>, < y,>, < z >) + endm +vpsrlw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrlw >, < x,>, < y,>, < z >) + endm +vptest macro x:req, y:req + %ECHO @CatStr(<vptest >, < x,>, < y>) + endm +vpsubb macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubb >, < x,>, < y,>, < z >) + endm +vpsubd macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubd >, < x,>, < y,>, < z >) + endm +vpsubq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubq >, < x,>, < y,>, < z >) + endm +vpsubsb macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubsb >, < x,>, < y,>, < z >) + endm +vpsubsw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubsw >, < x,>, < y,>, < z >) + endm +vpsubusb macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubusb >, < x,>, < y,>, < z >) + endm +vpsubusw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubusw >, < x,>, < y,>, < z >) + endm +vpsubw macro x:req, y:req, z:req + %ECHO @CatStr(<vpsubw >, < x,>, < y,>, < z >) + endm +vpunpckhbw macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpckhbw >, < x,>, < y,>, < z >) + endm +vpunpckhdq macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpckhdq >, < x,>, < y,>, < z >) + endm +vpunpckhqdq macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpckhqdq >, < x,>, < y,>, < z >) + endm +vpunpckhwd macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpckhwd >, < x,>, < y,>, < z >) + endm +vpunpcklbw macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpcklbw >, < x,>, < y,>, < z >) + endm +vpunpckldq macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpckldq >, < x,>, < y,>, < z >) + endm +vpunpcklqdq macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpcklqdq >, < x,>, < y,>, < z >) + endm +vpunpcklwd macro x:req, y:req, z:req + %ECHO @CatStr(<vpunpcklwd >, < x,>, < y,>, < z >) + endm +vrcpps macro x:req, y:req + %ECHO @CatStr(<vrcpps >, < x,>, < y>) + endm +; vrcpss: three-operand scalar form; the required third operand z is now echoed instead of being silently dropped +vrcpss macro x:req, y:req, z:req + %ECHO @CatStr(<vrcpss >, < x,>, < y,>, < z >) + endm +vrsqrtps macro x:req, y:req + %ECHO @CatStr(<vrsqrtps >, < x,>, < y>) + endm +; vrsqrtss: three-operand scalar form like vrcpss; z is optional so two-argument invocations keep echoing exactly as before +vrsqrtss
macro x:req, y:req, z + IFNB <z> + %ECHO @CatStr(<vrsqrtss >, < x,>, < y,>, < z >) + ELSE + %ECHO @CatStr(<vrsqrtss >, < x,>, < y>) + ENDIF +endm +vroundpd macro x:req, y:req, z:req + %ECHO @CatStr(<vroundpd >, < x,>, < y,>, < z >) + endm +vroundps macro x:req, y:req, z:req + %ECHO @CatStr(<vroundps >, < x,>, < y,>, < z >) + endm +vroundsd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vroundsd >, < x,>, < y,>, < z,>, < imm>) + endm +vroundss macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vroundss >, < x,>, < y,>, < z,>, < imm>) + endm +vshufpd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vshufpd >, < x,>, < y,>, < z,>, < imm>) + endm +vshufps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vshufps >, < x,>, < y,>, < z,>, < imm>) + endm +vsqrtpd macro x:req, y:req + %ECHO @CatStr(<vsqrtpd >, < x,>, < y>) + endm +vsqrtps macro x:req, y:req + %ECHO @CatStr(<vsqrtps >, < x,>, < y>) + endm +vsqrtsd macro x:req, y:req, z:req + %ECHO @CatStr(<vsqrtsd >, < x,>, < y,>, < z >) + endm +vsqrtss macro x:req, y:req, z:req + %ECHO @CatStr(<vsqrtss >, < x,>, < y,>, < z >) + endm +vstmxcsr macro x:req + %ECHO @CatStr(<vstmxcsr >, < x>) + endm +vsubpd macro x:req, y:req, z:req + %ECHO @CatStr(<vsubpd >, < x,>, < y,>, < z >) + endm +vsubps macro x:req, y:req, z:req + %ECHO @CatStr(<vsubps >, < x,>, < y,>, < z >) + endm +vsubsd macro x:req, y:req, z:req + %ECHO @CatStr(<vsubsd >, < x,>, < y,>, < z >) + endm +vsubss macro x:req, y:req, z:req + %ECHO @CatStr(<vsubss >, < x,>, < y,>, < z >) + endm +vucomisd macro x:req, y:req + %ECHO @CatStr(<vucomisd >, < x,>, < y>) + endm +vucomiss macro x:req, y:req + %ECHO @CatStr(<vucomiss >, < x,>, < y>) + endm +vunpckhpd macro x:req, y:req, z:req + %ECHO @CatStr(<vunpckhpd >, < x,>, < y,>, < z >) + endm +vunpckhps macro x:req, y:req, z:req + %ECHO @CatStr(<vunpckhps >, < x,>, < y,>, < z >) + endm +vunpcklpd macro x:req, y:req, z:req + %ECHO @CatStr(<vunpcklpd >, < x,>, < y,>, < z >) + endm +vunpcklps macro x:req, y:req, z:req + %ECHO @CatStr(<vunpcklps >, < x,>, < y,>, < z >) + endm +vxorpd macro 
x:req, y:req, z:req + %ECHO @CatStr(<vxorpd >, < x,>, < y,>, < z >) + endm +vxorps macro x:req, y:req, z:req + %ECHO @CatStr(<vxorps >, < x,>, < y,>, < z >) + endm +vzeroall macro + %ECHO @CatStr(<vzeroall>) + endm +vzeroupper macro + %ECHO @CatStr(<vzeroupper>) + endm + ELSE + OPTION NOKEYWORD:<blendvpd> + blendvpd macro x:req, y:req, z + %ECHO @CatStr(<blendvpd >, < x,>, < y>) + endm + OPTION NOKEYWORD:<blendvps> + blendvps macro x:req, y:req, z + %ECHO @CatStr(<blendvps >, < x,>, < y>) + endm + OPTION NOKEYWORD:<pblendvb> + pblendvb macro x:req, y:req, z + %ECHO @CatStr(<pblendvb >, < x,>, < y>) + endm + +;; OPTION NOKEYWORD:<vpbroadcastq> + vpbroadcastq macro x:req, y:req + %ECHO @CatStr(<vpbroadcastq >, <x, >, <y >) + endm + OPTION NOKEYWORD:<vpaddq> + vpaddq macro x:req, y:req, z:req + %ECHO @CatStr(<vpaddq >, < x,>, < y,>, < z >) + endm + OPTION NOKEYWORD:<vpmuludq> + vpmuludq macro x:req, y:req, z:req + %ECHO @CatStr(<vpmuludq >, < x,>, < y,>, < z >) + endm + + OPTION NOKEYWORD:<vpxor> + vpxor macro x:req, y:req, z:req + %ECHO @CatStr(<vpxor >, < x,>, < y,>, < z >) + endm + +ENDIF ;IFNDEF D_ML900 + + +vfmadd132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd132pd >, < x,>, < y,>, < z >) + endm +vfmadd213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd213pd >, < x,>, < y,>, < z >) + endm +vfmadd231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd231pd >, < x,>, < y,>, < z >) + endm +vfmaddrnd231pd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vfmaddrnd231pd >, < x,>, < y,>, < z,>, < imm>) + endm +vfmadd132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd132ps >, < x,>, < y,>, < z >) + endm +vfmadd213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd213ps >, < x,>, < y,>, < z >) + endm +vfmadd231ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd231ps >, < x,>, < y,>, < z >) + endm +vfmaddrnd231ps macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vfmaddrnd231ps >, < x,>, < y,>, < z,>, < imm>) + endm +vfmadd132sd macro x:req, y:req, 
z:req + %ECHO @CatStr(<vfmadd132sd >, < x,>, < y,>, < z >) + endm +vfmadd213sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd213sd >, < x,>, < y,>, < z >) + endm +vfmadd231sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd231sd >, < x,>, < y,>, < z >) + endm +vfmaddrnd231sd macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vfmaddrnd231sd >, < x,>, < y,>, < z,>, < imm>) + endm +vfmadd132ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd132ss >, < x,>, < y,>, < z >) + endm +vfmadd213ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd213ss >, < x,>, < y,>, < z >) + endm +vfmadd231ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmadd231ss >, < x,>, < y,>, < z >) + endm +vfmaddrnd231ss macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vfmaddrnd231ss >, < x,>, < y,>, < z,>, < imm>) + endm +vfmaddsub132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub132pd >, < x,>, < y,>, < z >) + endm +vfmaddsub213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub213pd >, < x,>, < y,>, < z >) + endm +vfmaddsub231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub231pd >, < x,>, < y,>, < z >) + endm +vfmaddsub132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub132ps >, < x,>, < y,>, < z >) + endm +vfmaddsub213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub213ps >, < x,>, < y,>, < z >) + endm +vfmaddsub231ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmaddsub231ps >, < x,>, < y,>, < z >) + endm +vfmsubadd132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsubadd132pd >, < x,>, < y,>, < z >) + endm +vfmsubadd213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsubadd213pd >, < x,>, < y,>, < z >) + endm +vfmsubadd231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsubadd231pd >, < x,>, < y,>, < z >) + endm +vfmsubadd132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsubadd132ps >, < x,>, < y,>, < z >) + endm +vfmsubadd213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsubadd213ps >, < x,>, < y,>, < z >) + endm +vfmsubadd231ps macro x:req, y:req, 
z:req + %ECHO @CatStr(<vfmsubadd231ps >, < x,>, < y,>, < z >) + endm +vfmsub132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub132pd >, < x,>, < y,>, < z >) + endm +vfmsub213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub213pd >, < x,>, < y,>, < z >) + endm +vfmsub231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub231pd >, < x,>, < y,>, < z >) + endm +vfmsub132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub132ps >, < x,>, < y,>, < z >) + endm +vfmsub213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub213ps >, < x,>, < y,>, < z >) + endm +vfmsub231ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub231ps >, < x,>, < y,>, < z >) + endm +vfmsub132sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub132sd >, < x,>, < y,>, < z >) + endm +vfmsub213sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub213sd >, < x,>, < y,>, < z >) + endm +vfmsub231sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub231sd >, < x,>, < y,>, < z >) + endm +vfmsub132ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub132ss >, < x,>, < y,>, < z >) + endm +vfmsub213ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub213ss >, < x,>, < y,>, < z >) + endm +vfmsub231ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfmsub231ss >, < x,>, < y,>, < z >) + endm +vfnmadd132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd132pd >, < x,>, < y,>, < z >) + endm +vfnmadd213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd213pd >, < x,>, < y,>, < z >) + endm +vfnmadd231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd231pd >, < x,>, < y,>, < z >) + endm +vfnmadd132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd132ps >, < x,>, < y,>, < z >) + endm +vfnmadd213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd213ps >, < x,>, < y,>, < z >) + endm +vfnmadd231ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd231ps >, < x,>, < y,>, < z >) + endm +vfnmadd132sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd132sd >, < x,>, < y,>, < z >) + endm +vfnmadd213sd macro x:req, y:req, 
z:req + %ECHO @CatStr(<vfnmadd213sd >, < x,>, < y,>, < z >) + endm +vfnmadd231sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd231sd >, < x,>, < y,>, < z >) + endm +vfnmadd132ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd132ss >, < x,>, < y,>, < z >) + endm +vfnmadd213ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd213ss >, < x,>, < y,>, < z >) + endm +vfnmadd231ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmadd231ss >, < x,>, < y,>, < z >) + endm +vfnmsub132pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub132pd >, < x,>, < y,>, < z >) + endm +vfnmsub213pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub213pd >, < x,>, < y,>, < z >) + endm +vfnmsub231pd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub231pd >, < x,>, < y,>, < z >) + endm +vfnmsub132ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub132ps >, < x,>, < y,>, < z >) + endm +vfnmsub213ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub213ps >, < x,>, < y,>, < z >) + endm +vfnmsub231ps macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub231ps >, < x,>, < y,>, < z >) + endm +vfnmsub132sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub132sd >, < x,>, < y,>, < z >) + endm +vfnmsub213sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub213sd >, < x,>, < y,>, < z >) + endm +vfnmsub231sd macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub231sd >, < x,>, < y,>, < z >) + endm +vfnmsub132ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub132ss >, < x,>, < y,>, < z >) + endm +vfnmsub213ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub213ss >, < x,>, < y,>, < z >) + endm +vfnmsub231ss macro x:req, y:req, z:req + %ECHO @CatStr(<vfnmsub231ss >, < x,>, < y,>, < z >) + endm + +; AVX2 (HSW) + +vpsllvd macro x:req, y:req, z:req + %ECHO @CatStr(<vpsllvd >, < x,>, < y,>, < z >) + endm +vpsllvq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsllvq >, < x,>, < y,>, < z >) + endm +vcvtph2ps macro x:req, z:req + %ECHO @CatStr(<vcvtph2ps >, < x,>, < z >) + endm +andn macro x:req, y:req, z:req + 
%ECHO @CatStr(<andn >, < x,>, < y,>, < z >) + endm +bextr macro x:req, y:req, z:req + %ECHO @CatStr(<bextr >, < x,>, < y,>, < z >) + endm +blsi macro x:req, z:req + %ECHO @CatStr(<blsi >, < x,>, < z >) + endm +; blsmsk: echo the mnemonic under the same spelling as the macro name (it was previously emitted as "blmsk") +blsmsk macro x:req, z:req + %ECHO @CatStr(<blsmsk >, < x,>, < z >) + endm +blsr macro x:req, z:req + %ECHO @CatStr(<blsr >, < x,>, < z >) + endm +bzhi macro x:req, y:req, z:req + %ECHO @CatStr(<bzhi >, < x,>, < y,>, < z >) + endm +;lzcnt macro x:req, z:req +; %ECHO @CatStr(<lzcnt >, < x,>, < z >) +; endm +mulx macro x:req, y:req, z:req + %ECHO @CatStr(<mulx >, < x,>, < y,>, < z >) + endm +pdep macro x:req, y:req, z:req + %ECHO @CatStr(<pdep >, < x,>, < y,>, < z >) + endm +pext macro x:req, y:req, z:req + %ECHO @CatStr(<pext >, < x,>, < y,>, < z >) + endm +rorx macro x:req, y:req, z:req + %ECHO @CatStr(<rorx >, < x,>, < y,>, < z >) + endm +sarx macro x:req, y:req, z:req + %ECHO @CatStr(<sarx >, < x,>, < y,>, < z >) + endm +shlx macro x:req, y:req, z:req + %ECHO @CatStr(<shlx >, < x,>, < y,>, < z >) + endm +shrx macro x:req, y:req, z:req + %ECHO @CatStr(<shrx >, < x,>, < y,>, < z >) + endm +tzcnt macro x:req, z:req + %ECHO @CatStr(<tzcnt >, < x,>, < z >) + endm +invpcid macro x:req, z:req + %ECHO @CatStr(<invpcid >, < x,>, < z >) + endm +rdrand macro x:req + %ECHO @CatStr(<rdrand >, < x >) + endm +rdseed macro x:req + %ECHO @CatStr(<rdseed >, < x >) + endm +adcx macro x:req, z:req + %ECHO @CatStr(<adcx >, < x,>, < z >) + endm +adox macro x:req, z:req + %ECHO @CatStr(<adox >, < x,>, < z >) + endm +;prefetchw macro x:req +; %ECHO @CatStr(<prefetchw >, < x >) +; endm +vpbroadcast macro x:req, y:req, z:req + %ECHO @CatStr(<vpbroadcast >, < x,>, < y,>, < z >) + endm +vpbroadcastb macro x:req, y:req + %ECHO @CatStr(<vpbroadcastb >, <x, >, <y >) +endm +vpbroadcastw macro x:req, y:req + %ECHO @CatStr(<vpbroadcastw >, <x, >, <y >) +endm +vpbroadcastd macro x:req, y:req + %ECHO @CatStr(<vpbroadcastd >, <x, >, <y >) +endm +vpermd macro x:req, y:req, z:req + %ECHO 
@CatStr(<vpermd >, < x,>, < y,>, < z >) + endm +vpermpd macro x:req, y:req, z:req + %ECHO @CatStr(<vpermpd >, < x,>, < y,>, < z >) + endm +vpermps macro x:req, y:req, z:req + %ECHO @CatStr(<vpermps >, < x,>, < y,>, < z >) + endm +vpermq macro x:req, y:req, z:req + %ECHO @CatStr(<vpermq >, < x,>, < y,>, < z >) + endm +; vperm2i128: like vperm2f128 and vinserti128 the instruction carries a trailing imm8 control; imm is optional so three-argument invocations keep echoing exactly as before +vperm2i128 macro x:req, y:req, z:req, imm + IFNB <imm> + %ECHO @CatStr(<vperm2i128 >, < x,>, < y,>, < z,>, < imm>) + ELSE + %ECHO @CatStr(<vperm2i128 >, < x,>, < y,>, < z >) + ENDIF +endm +vextracti128 macro x:req, y:req, z:req + %ECHO @CatStr(<vextracti128 >, < x,>, < y,>, < z >) + endm +vinserti128 macro x:req, y:req, z:req, imm:req + %ECHO @CatStr(<vinserti128 >, < x,>, < y,>, < z,>, < imm>) + endm +vpmaskmov macro x:req, y:req, z:req + %ECHO @CatStr(<vpmaskmov >, < x,>, < y,>, < z >) + endm +vpsravd macro x:req, y:req, z:req + %ECHO @CatStr(<vpsravd >, < x,>, < y,>, < z >) + endm +vpsrlvd macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrlvd >, < x,>, < y,>, < z >) + endm +vpsrlvq macro x:req, y:req, z:req + %ECHO @CatStr(<vpsrlvq >, < x,>, < y,>, < z >) + endm +vgatherdpd macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherdpd >, < x,>, < y,>, < z >) + endm +vgatherqpd macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherqpd >, < x,>, < y,>, < z >) + endm +vgatherdps macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherdps >, < x,>, < y,>, < z >) + endm +vgatherqps macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherqps >, < x,>, < y,>, < z >) + endm +vgatherdd macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherdd >, < x,>, < y,>, < z >) + endm +vgatherqd macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherqd >, < x,>, < y,>, < z >) + endm +vgatherdq macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherdq >, < x,>, < y,>, < z >) + endm +vgatherqq macro x:req, y:req, z:req + %ECHO @CatStr(<vgatherqq >, < x,>, < y,>, < z >) + endm +;vpmaddubsw macro x:req, y:req, z:req +; %ECHO @CatStr(<vpmaddubsw >, < x,>, < y,>, < z >) +; endm +;vmpsadbw macro x:req, y:req, z:req +; %ECHO @CatStr(<vmpsadbw >, < x,>, < y,>, < z >) +; endm + +ENDIF ; IFNDEF ML1100 +ENDIF ; 
IFNDEF ML1200 + +ELSE ; MNI & SNI macro for Linux or for Windows + +IFNDEF ML1100 + + IF IPP_ABI LE 1 + OPTION NOKEYWORD:<pmuludq> + IFHIGH_REG MACRO x, f + f = 0 + FOR y,<xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15> + IFIDN <y>,<x> + f = 1 + EXITM + ENDIF + ENDM + IF f EQ 0 + FOR y,<r8,R8,r9,R9,r10,R10,r11,R11,r12,R12,r13,R13,r14,R14,r15,R15> + IF @InStr( , x, y ) NE 0 + f = 1 + EXITM + ENDIF + ENDM + ENDIF + ENDM + IFMMX_REG MACRO x, f + f = 0 + FOR y,<mm0,MM0,mm1,MM1,mm2,MM2,mm3,MM3,mm4,MM4,mm5,MM5,mm6,MM6,mm7,MM7> + IFIDN <y>,<x> + f = 1 + EXITM + ENDIF + ENDM + ENDM + + ;;66/REX 0F F4 /r pmuludq xmm1, xmm2/m128 + pmuludq macro dst:req, src:req + local x, y + IFMMX_REG <dst>,f + IF f GT 0 + x: + paddq dst, src + y: + org x+1 + db 0F4h + org y + ELSE + x: + addpd dst, src + y: + IFHIGH_REG <dst>,f + IF f EQ 0 + IFHIGH_REG <src>,f + ENDIF + IF f GT 0 + org x+3 + ELSE + org x+2 + ENDIF + db 0F4h + org y + ENDIF + endm + + ENDIF + +nis_mni = 38h ;new instruction set +nis_mnia = 3Ah ;new instruction set 'a' +reg_mmx = 0Fh ;media registers type +reg_xmm = 66h ;media registers type + +opc_phaddw = 01h +opc_phaddd = 02h +opc_phaddsw = 03h +opc_phsubw = 05h +opc_phsubd = 06h +opc_phsubsw = 07h +opc_pmaddubsw = 04h +opc_pmulhrsw = 0Bh +opc_pshufb = 00h +opc_psignb = 08h +opc_psignw = 09h +opc_psignd = 0Ah +opc_palignr = 0Fh +opc_pabsb = 1Ch +opc_pabsw = 1Dh +opc_pabsd = 1Eh + +HIGHQ_GPR textequ <!<r8,R8,r9,R9,r10,R10,r11,R11,r12,R12,r13,R13,r14,R14,r15,R15!>> +LOWQ_GPR textequ <!<rax,RAX,rcx,RCX,rdx,RDX,rbx,RBX,rsp,RSP,rbp,RBP,rsi,RSI,rdi,RDI!>> +HIGH_XMM textequ <!<xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15!>> +LOW_XMM textequ <!<xmm0,XMM0,xmm1,XMM1,xmm2,XMM2,xmm3,XMM3,xmm4,XMM4,xmm5,XMM5,xmm6,XMM6,xmm7,XMM7!>> +ALL_MMX textequ <!<mm0,MM0,mm1,MM1,mm2,MM2,mm3,MM3,mm4,MM4,mm5,MM5,mm6,MM6,mm7,MM7!>> +HIGHDQ_GPR textequ 
<!<R8D,r8d,R8,r8,R9D,r9d,R9,r9,R10D,r10d,R10,r10,R11D,r11d,R11,r11,R12D,r12d,R12,r12,R13D,r13d,R13,r13,R14D,r14d,R14,r14,R15D,r15d,R15,r15!>> +LOWDQ_GPR textequ <!<EAX,eax,RAX,rax,ECX,ecx,RCX,rcx,EDX,edx,RDX,rdx,EBX,ebx,RBX,rbx,ESP,esp,RSP,rsp,EBP,ebp,RBP,rbp,ESI,esi,RSI,rsi,EDI,edi,RDI,rdi!>> +LOWD_GPR textequ <!<eax,EAX,ecx,ECX,edx,EDX,ebx,EBX,esp,ESP,ebp,EBP,esi,ESI,edi,EDI!>> +HIGHD_GPR textequ <!<r8d,R8D,r9d,R9D,r10d,R10D,r11d,R11D,r12d,R12D,r13d,R13D,r14d,R14D,r15d,R15D!>> +LOWW_GPR textequ <!<ax,AX,cx,CX,dx,DX,bx,BX,sp,SP,bp,BP,si,SI,di,DI!>> +HIGHW_GPR textequ <!<r8w,R8W,r9w,R9W,r10w,R10W,r11w,R11W,r12w,R12W,r13w,R13W,r14w,R14W,r15w,R15W!>> +LOWB_GPR textequ <!<al,AL,cl,CL,dl,DL,bl,BL,ah,AH,ch,CH,dh,DH,bh,BH!>> +HIGHB_GPR textequ <!<r8b,R8B,r9b,R9B,r10b,R10B,r11b,R11B,r12b,R12B,r13b,R13B,r14b,R14B,r15b,R15B,spl,SPL,bpl,BPL,sil,SIL,dil,DIL!>> +ALL_NUM textequ <!<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0!>> + +; IS_REX x, REX: sets REX to 1 when operand x is exactly one of the HIGH_XMM names or contains one of the HIGHDQ_GPR names, otherwise leaves it 0 +IS_REX MACRO x, REX + REX = 0 + %FOR yrex,HIGH_XMM ; if xmm from 8-15 range - REX byte is required + IFIDN <yrex>,<x> + REX = 1 + EXITM + ENDIF + ENDM + IF REX EQ 0 + %FOR yrex,HIGHDQ_GPR ; if gpr from 8-15 range - REX byte is required + IF @InStr( , x, yrex ) NE 0 + REX = 1 + EXITM + ENDIF + ENDM + ENDIF +ENDM + +; IS_MMX x, MMX: sets MMX to 1 when operand x is exactly one of the ALL_MMX names, otherwise 0 +IS_MMX MACRO x, MMX + MMX = 0 + %FOR ymmx,ALL_MMX ; test if operand is a mmx register + IFIDN <ymmx>,<x> + MMX = 1 + EXITM + ENDIF + ENDM +ENDM + +SUBST_GPR MACRO x ; this macro substitutes any gpr from the high half (8-15) + xretgpr textequ <x> ; with the gpr from the low half which produces the same + qgpr = 0 ; index in the mod/r/m and sib bytes + %FOR ygpr,HIGHDQ_GPR + posgpr INSTR <x>,<ygpr> + IF posgpr GT 0 + fgpr = 0 + %FOR zgpr,LOWDQ_GPR + IF fgpr EQ qgpr + f1gpr SUBSTR <x>, 1, posgpr-1 + f2gpr SUBSTR <x>, posgpr + @SizeStr( ygpr ) + xretgpr CATSTR <f1gpr>, < zgpr >, <f2gpr> + EXITM xretgpr + ENDIF ; if f == q (same list position in LOWDQ_GPR as the match in HIGHDQ_GPR) + fgpr = fgpr + 1 + ENDM ; for z + ENDIF ; if posx > 0 + qgpr = qgpr + 1 + ENDM ; for y + EXITM xretgpr +ENDM + +SUBST_XMM MACRO x ; 
this macro substitutes any xmm from the high half (8-15) + xretxmm textequ <x> ; with the xmm from the low half which produces the same + lxmm = 0 ; index in the mod/r/m byte + %FOR yxmm,HIGH_XMM + posxmm INSTR <x>,<yxmm> + IF posxmm GT 0 + fxmm = 0 + %FOR zxmm,LOW_XMM + IF fxmm EQ lxmm + xretxmm textequ <zxmm> + EXITM xretxmm + ENDIF ; if f == l (same list position in LOW_XMM as the match in HIGH_XMM) + fxmm = fxmm + 1 + ENDM ; for z + ENDIF ; if posx > 0 + lxmm = lxmm + 1 + ENDM ; for y + EXITM xretxmm +ENDM + +; NOTE(review): SUBST_GPR is applied twice below - presumably so that two high gprs in one operand (e.g. base and index) are both replaced; confirm +SUBST_HIGH MACRO x ; a wrapper for macros that substitute up-half registers + xs textequ SUBST_GPR( x ) ; with their ia32 analogues that have the same index in + xs1 textequ SUBST_GPR( %xs ) ; the mod/r/m byte + xs2 textequ SUBST_XMM( %xs1 ) + EXITM xs2 +ENDM + +SUBST_MIMM MACRO x, y ; if "x" contains direct reference to memory operand (by + zimm = ( OPATTR( x )) AND 0011110111y ; name defined in code or data section) it is substituted + IF zimm EQ 0 ; by "y" operand in order to produce right REX byte, but + ximm textequ <y> ; don't produce relocation record (because current address + ELSE ; for relocation due to different instruction length is wrong) + ximm textequ <x> + ENDIF + EXITM ximm +ENDM + +IS_NAME MACRO x ; if "x" contains direct reference to memory operand (by + znam = ( OPATTR( x )) AND 0011110111y ; name defined in code or data section) 1 is returned + IF znam EQ 0 ; else 0 + xnam = 1 + ELSE + xnam = 0 + ENDIF + EXITM %xnam +ENDM + + +; mni_instruction dst, src, nis, opc [, imm8]: common emitter behind the phaddw..pabsd wrappers; nis/opc are the nis_* / opc_* byte values written with db, imm8 is an optional trailing immediate byte +mni_instruction macro dst:req, src:req, nis:req, opc:req, imm8 + local x0, x1, x2, x3, x4, x5, x6, x7 + + IS_REX <src>,REX ; do we need REX byte due to src operand? + REXS = REX + IF REXS EQ 1 ; if yes - we have to prepare substitution in order + s1rc textequ SUBST_HIGH( src ) ; to work correctly with direct memory operands + ELSE + s1rc textequ <src> ; else substitution is not required + ENDIF + IS_REX <dst>,REX ; do we need REX byte due to dst operand? 
+ REXD = REX + IF REXD EQ 1 ; if yes - we have to prepare substitution in order + d1st textequ SUBST_HIGH( dst ) ; to work correctly with direct memory operands + ELSE + d1st textequ <dst> ; else substitution is not required + ENDIF + REX = REXS + REXD + NAMS = IS_NAME( src ) + NAMD = IS_NAME( dst ) + isname = NAMS + NAMD + IS_MMX <dst>,MMX ; the same instruction set for both MMX and SSE + IF MMX GT 0 ; we need to separate them because of different length (in bytes) + s2rc textequ SUBST_MIMM( src, mm0 ) + d2st textequ SUBST_MIMM( dst, mm0 ) + IF isname GT 0 ; if src or dst contains direct reference to memory operand + IF REX GT 0 + x0: + nop + nop + pand d1st,s1rc ; 90 90 0F DB /r m32 + x1: + org x0 + pand d2st,s2rc ; REX 0F DB /r /r m32 + org x0+2 + db nis + db opc + IFNB <imm8> + org x0+5 + dd 0FFFFFFFFH + org x1 ; 66 REX 0F nis opc /r m32 + db imm8 + ELSE + org x1 + ENDIF + ELSE + db reg_mmx ; MMX processing + x2: + pand dst, src ; 0F 0F DB /r m32 + x3: + org x2 + db nis + db opc + IFNB <imm8> + org x2+3 + dd 0FFFFFFFFH + org x3 ; 0F nis opc /r m32 + db imm8 + ELSE + org x3 + ENDIF + ENDIF + ELSE ; if src or dst doesn't contain direct reference to memory operand + IF REX GT 0 + x0: + pand dst,src ; REX 0F DB /r + org x0+1 + pand dst,src ; REX REX 0F DB /r + x1: + org x0+1 + db reg_mmx + db nis + db opc + org x1 ; REX reg_mmx nis opc /r + IFNB <imm8> + db imm8 + ENDIF + ELSE + db reg_mmx ; MMX processing + x2: + pand dst, src ; reg_mmx 0F DB /r + x3: + org x2 + db nis + db opc + org x3 ; reg_mmx nis opc /r + IFNB <imm8> + db imm8 + ENDIF + ENDIF + ENDIF + ELSE ; SSE processing + s2rc textequ SUBST_MIMM( src, xmm0 ) + d2st textequ SUBST_MIMM( dst, xmm0 ) + IF isname GT 0 ; if src or dst contains direct reference to memory operand + IF REX GT 0 + db reg_xmm + x4: + nop + nop + mulps d1st,s1rc ; 66 90 90 0F 59 /r m32 + x5: + org x4 + mulps d2st,s2rc ; 66 REX 0F 59 /r /r m32 + org x4+2 + db nis + db opc + IFNB <imm8> + org x4+5 + dd 0FFFFFFFFH + org x5 ; 66 REX 0F 
nis opc /r m32 + db imm8 + ELSE + org x5 + ENDIF + ELSE + db reg_xmm + x6: + nop + mulps dst, src ; 66 90 0F 59 /r m32 + x7: + org x6 + db reg_mmx + db nis + db opc + IFNB <imm8> + org x6+4 + dd 0FFFFFFFFH + org x7 ; 66 0F nis opc /r m32 + db imm8 + ELSE + org x7 + ENDIF + ENDIF + ELSE ; if src or dst doesn't contain direct reference to memory operand + IF REX GT 0 + db reg_xmm + x4: + mulps dst,src ; 66 REX 0F 59 /r + org x4+1 + mulps dst,src ; 66 REX REX 0F 59 /r + x5: + org x4+1 + db reg_mmx + db nis + db opc + org x5 ; 66 REX 0F nis opc /r + IFNB <imm8> + db imm8 + ENDIF + ELSE + db reg_xmm + x6: + nop + mulps dst, src ; 66 90 0F 59 /r + x7: + org x6 + db reg_mmx + db nis + db opc + org x7 ; 66 0F nis opc /r + IFNB <imm8> + db imm8 + ENDIF + ENDIF + ENDIF + ENDIF +endm + +;IF @Version LT 900 +IFNDEF D_ML900 + +; OPTION NOKEYWORD:<phaddw> +; 0F 38 01 /r phaddw mm1, mm2/m64 +; 66 0F 38 01 /r phaddw xmm1, xmm2/m128 +phaddw macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phaddw +endm + +; OPTION NOKEYWORD:<phaddd> +; 0F 38 02 /r phaddd mm1, mm2/m64 +; 66 0F 38 02 /r phaddd xmm1, xmm2/m128 +phaddd macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phaddd +endm + +; OPTION NOKEYWORD:<phaddsw> +; 0F 38 03 /r phaddsw mm1, mm2/m64 +; 66 0F 38 03 /r phaddsw xmm1, xmm2/m128 +phaddsw macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phaddsw +endm + +; OPTION NOKEYWORD:<phsubw> +; 0F 38 05 /r phsubw mm1, mm2/m64 +; 66 0F 38 05 /r phsubw xmm1, xmm2/m128 +phsubw macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phsubw +endm + +; OPTION NOKEYWORD:<phsubd> +; 0F 38 06 /r phsubd mm1, mm2/m64 +; 66 0F 38 06 /r phsubd xmm1, xmm2/m128 +phsubd macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phsubd +endm + +; OPTION NOKEYWORD:<phsubsw> +; 0F 38 07 /r phsubsw mm1, mm2/m64 +; 66 0F 38 07 /r phsubsw xmm1, xmm2/m128 +phsubsw macro dst:req, src:req + %mni_instruction dst, src, nis_mni, opc_phsubsw +endm + +; 
; OPTION NOKEYWORD:<palignr>
; 0F 3A 0F /r ib     palignr mm1, mm2/m64, imm8
; 66 0F 3A 0F /r ib  palignr xmm1, xmm2/m128, imm8
; The immediate is passed on to mni_instruction, which emits it itself
; ("db imm8" when the argument is non-blank), so no extra byte is emitted here.
palignr macro dst:req, src:req, imm8:req
    %mni_instruction dst, src, nis_mnia, opc_palignr, imm8
;   db imm8
endm
;-------------------------------------------------------------------------------
; sni_instruction dst, src, nis, opc [, imm8]
;
; Hand-encodes a two-operand XMM instruction whose mnemonic the assembler does
; not know.  A MOVAPS "carrier" is assembled with the requested operands so the
; assembler produces the operand bytes itself; ORG then rewinds the location
; counter and the leading bytes are overwritten with <nis> and <opc>.
;
;   dst, src - operands.  An operand containing '[' must also contain
;              "oword" or "OWORD"; otherwise .ERR is raised and the macro exits.
;   nis      - byte emitted in front of opc (the callers in this file pass the
;              38h / 3Ah constants)
;   opc      - opcode byte
;   imm8     - optional; when non-blank it is emitted as the final byte
;
; Four code paths are selected by two flags:
;   isname - IS_NAME() reported a direct (named) memory operand in src or dst
;   REX    - IS_REX reported that src or dst needs a REX prefix
; The byte-layout comments on the individual lines are the original author's;
; they imply reg_xmm = 66h and reg_mmx = 0Fh (both are defined outside this
; macro - TODO confirm).
;-------------------------------------------------------------------------------
sni_instruction macro dst:req, src:req, nis:req, opc:req, imm8
    local x0, x1, x2, x3, x4, x5, x6, x7

    ; --- operand validation: bracketed operands must be spelled "oword ptr" ---
    bracket INSTR <src>,<[>
    IF bracket GT 0
        memtype INSTR <src>,<oword>
        IF memtype EQ 0
            memtype INSTR <src>,<OWORD>
        ENDIF
        IF memtype EQ 0
            .ERR <src must contain: oword ptr >
            EXITM
        ENDIF
    ENDIF
    bracket INSTR <dst>,<[>
    IF bracket GT 0
        memtype INSTR <dst>,<oword>
        IF memtype EQ 0
            memtype INSTR <dst>,<OWORD>
        ENDIF
        IF memtype EQ 0
            .ERR <dst must contain: oword ptr >
            EXITM
        ENDIF
    ENDIF

    ; --- REX detection; s1rc/d1st are the operands after SUBST_HIGH ---
    IS_REX <src>,REX                        ; do we need REX byte due to src operand?
    REXS = REX
    IF REXS EQ 1                            ; if yes - we have to prepare substitution in order
        s1rc textequ SUBST_HIGH( src )      ; to work correctly with direct memory operands
    ELSE
        s1rc textequ <src>                  ; else substitution is not required
    ENDIF
    IS_REX <dst>,REX                        ; do we need REX byte due to dst operand?
    REXD = REX
    IF REXD EQ 1                            ; if yes - we have to prepare substitution in order
        d1st textequ SUBST_HIGH( dst )      ; to work correctly with direct memory operands
    ELSE
        d1st textequ <dst>                  ; else substitution is not required
    ENDIF
    REX = REXS + REXD                       ; non-zero when either operand needs REX
    NAMS = IS_NAME( src )                   ; is there the direct memory operand (defined by name in code
    NAMD = IS_NAME( dst )                   ; or data section)? if yes - then another algorithm for macro
    isname = NAMS + NAMD                    ; substitution due to bug in ml with relocations definition
    s2rc textequ SUBST_MIMM( src, xmm0 )
    d2st textequ SUBST_MIMM( dst, xmm0 )
    IF isname GT 0                          ; if src or dst contains direct reference to memory operand
        IF REX GT 0
            db reg_xmm
        x0:
            nop
            nop
            movaps d1st,s1rc                ; 66 90 90 0F 28 /r m32
        x1:                                 ; x1 = end of the finished instruction (before imm8)
            org x0
            movaps d2st,s2rc                ; 66 REX 0F 28 /r /r m32
            org x0+2
            db nis
            db opc
            IFNB <imm8>
                org x0+5
                ; NOTE(review): writes -1 over the dword at x0+5, i.e. the m32
                ; field in the layout comment; presumably compensates the
                ; displacement for the trailing imm8 byte - confirm.
                dd 0FFFFFFFFH
                org x1                      ; 66 REX 0F nis opc /r m32
                db imm8
            ELSE
                org x1
            ENDIF
        ELSE
            db reg_xmm
        x2:
            nop
            movaps dst, src                 ; 66 90 0F 28 /r m32
        x3:                                 ; x3 = end of the finished instruction (before imm8)
            org x2
            db reg_mmx
            db nis
            db opc
            IFNB <imm8>
                org x2+4
                ; NOTE(review): same -1 dword patch as in the REX path - confirm.
                dd 0FFFFFFFFH
                org x3                      ; 66 0F nis opc /r m32
                db imm8
            ELSE
                org x3
            ENDIF
        ENDIF
    ELSE                                    ; if src or dst doesn't contain direct reference to memory operand
        IF REX GT 0
            db reg_xmm
        x4:
            movaps dst,src                  ; 66 REX 0F 28 /r
            org x4+1                        ; keep the first REX, re-assemble one byte later
            movaps dst,src                  ; 66 REX REX 0F 28 /r
        x5:
            org x4+1
            db reg_mmx
            db nis
            db opc
            org x5                          ; 66 REX 0F nis opc /r
            IFNB <imm8>
                db imm8
            ENDIF
        ELSE
            db reg_xmm
        x6:
            nop                             ; placeholder byte, overwritten by reg_mmx below
            movaps dst, src                 ; 66 90 0F 28 /r
        x7:
            org x6
            db reg_mmx
            db nis
            db opc
            org x7                          ; 66 0F nis opc /r
            IFNB <imm8>
                db imm8
            ENDIF
        ENDIF
    ENDIF
endm
;-------------------------------------------------------------------------------
; REPLACE_MMX( x, gpr32_64 )  - macro function
;
; Returns the name of the general-purpose register that sits at the same
; position in LOWQ_GPR (gpr32_64 > 0) or LOWD_GPR (gpr32_64 = 0) as the
; ALL_MMX entry found inside x.  pextrw uses the result to assemble a
; "mov reg, reg" carrier.  If no ALL_MMX entry occurs in x, x itself is
; returned.
;
; NOTE(review): the EXITM inside the inner %FOR appears to leave only that
; loop (REPLACE_XMM uses an explicit "found" flag for the same situation);
; the value is actually returned by the final EXITM - confirm.
; Uses the global symbols xretgpr, qgpr, fgpr and posgpr as scratch.
;-------------------------------------------------------------------------------
REPLACE_MMX MACRO x, gpr32_64                   ; this macro substitutes any mmx register (in order to use mov r32/64,r32/64 instr)
    xretgpr textequ <x>                         ; with the gpr equivalent (with the same index in mod/r/m byte) for pextrw instr
    qgpr = 0                                    ; index of the ALL_MMX entry being tested
    %FOR ygpr,ALL_MMX
        posgpr INSTR <x>,<ygpr>
        IF posgpr GT 0
            IF gpr32_64 GT 0
                fgpr = 0                        ; index into LOWQ_GPR
                %FOR zgpr,LOWQ_GPR
                    IF fgpr EQ qgpr
                        xretgpr textequ <zgpr>
                        EXITM xretgpr
                    ENDIF                       ; if fgpr == qgpr
                    fgpr = fgpr + 1
                ENDM                            ; for zgpr
            ELSE                                ; gpr 32 or 64
                fgpr = 0                        ; index into LOWD_GPR
                %FOR zgpr,LOWD_GPR
                    IF fgpr EQ qgpr
                        xretgpr textequ <zgpr>
                        EXITM xretgpr
                    ENDIF                       ; if fgpr == qgpr
                    fgpr = fgpr + 1
                ENDM                            ; for zgpr
            ENDIF                               ; gpr 32 or 64
        ENDIF                                   ; if posgpr > 0
        qgpr = qgpr + 1
    ENDM                                        ; for ygpr
    EXITM xretgpr
ENDM
;-------------------------------------------------------------------------------
; sni_instr_gpr_new dst, src, nis, opc, imm8
;
; Encodes a "66 [REX] 0F nis opc /r imm8" instruction in which one operand is an
; XMM register and the other a general-purpose register.  The XMM operand is
; replaced (REPLACE_XMM) by the GPR with the same register index, a
; "mov gpr, gpr" carrier is assembled to obtain the REX and ModRM bytes, and
; the opcode bytes are then overwritten through ORG.
;
;   imm8 is emitted unconditionally by the final "db", so callers must supply it.
; Uses the global symbols gpr32_64*, rexbyte_*, REX, REXS, REXD, s2rc and d2st.
;-------------------------------------------------------------------------------
sni_instr_gpr_new macro dst:req, src:req, nis:req, opc:req, imm8
    local x1, y1, x2, y2
    gpr32_64_d = 0                              ; 32-bit or 64-bit form is used? (dst)
    rexbyte_d = 0
    gpr32_64_s = 0                              ; 32-bit or 64-bit form is used? (src)
    rexbyte_s = 0
    DO_NEED_REX dst, gpr32_64_d, rexbyte_d      ; test whether a REX byte is required by a GPR in dst
    IS_REX <dst>,REX                            ; do we need REX byte due to dst operand?
    REXD = REX + rexbyte_d
    DO_NEED_REX src, gpr32_64_s, rexbyte_s      ; test whether a REX byte is required by a GPR in src
    IS_REX <src>,REX                            ; do we need REX byte due to src operand?
    REXS = REX + rexbyte_s
    REX = REXS + REXD                           ; non-zero when any operand needs REX
    gpr32_64 = gpr32_64_s + gpr32_64_d          ; non-zero when a 64-bit GPR was seen
    s2rc textequ REPLACE_XMM( src, gpr32_64 )   ; substitute src xmm register with gpr that has the same index in mod/r/m byte
    d2st textequ REPLACE_XMM( dst, gpr32_64 )   ; substitute dst xmm register with gpr that has the same index in mod/r/m byte
    IF REX GT 0
        db 66h
    x1:
;%echo @CatStr( <r in d1= >,<dst>,< s1=>,<src>)
;%echo @CatStr( <r out d1= >,<d2st>,< s1=>,<s2rc>)
        mov d2st, s2rc                          ; 66 REX 8B /r
        org x1+2
        mov d2st, s2rc                          ; 66 REX 8B REX 8B /r
    y1:                                         ; y1 = end of the finished instruction (before imm8)
        org x1+1                                ; keep the first REX, overwrite the three bytes after it
        db 0Fh
        db nis
        db opc                                  ; 66 REX 0F nis opc /r
        org y1
    ELSE
        db 66h
        db 0Fh
        db nis
    x2:
;%echo @CatStr( <nr in d1= >,<dst>,< s1=>,<src>)
;%echo @CatStr( <nr out d1= >,<d2st>,< s1=>,<s2rc>)
        mov d2st, s2rc                          ; 66 0F nis 8B /r
    y2:
        org x2                                  ; overwrite the carrier's 8B opcode byte
        db opc                                  ; 66 0F nis opc /r
        org y2
    ENDIF
    db imm8                                     ; 66 <REX> 0F nis opc /r imm8
endm
;-------------------------------------------------------------------------------
; sni_instr_src_m_gpr dst, src, nis, opc, mem [, imm8]
;
; Front end for instructions whose source is either a general-purpose register
; or a memory operand narrower than 128 bits.
;
;   dst, src - operands as written by the user
;   nis, opc - escape / opcode bytes, passed through unchanged
;   mem      - size tag of the memory form: m8, m16, m32 or m64
;   imm8     - optional immediate, passed through unchanged
;
; Memory source (src contains "[", "ptr" or "PTR"): the size keyword selected by
;   mem must occur in src; it is replaced by " oword " and the result is handed
;   to sni_instruction.  Otherwise .ERR is raised.
; Register source: must be recognised by IS_GPRDQ and is handed to
;   sni_instr_gpr_new; otherwise .ERR is raised.
;
; An unrecognised mem tag is rejected with .ERR.  memlc/memuc are global text
; macros, so without this check they would be undefined or keep the value left
; behind by an earlier expansion.
;-------------------------------------------------------------------------------
sni_instr_src_m_gpr macro dst:req, src:req, nis:req, opc:req, mem:req, imm8
    IFIDN <mem>,<m8>
        memlc textequ <byte>
        memuc textequ <BYTE>
    ELSEIFIDN <mem>,<m16>
        memlc textequ <word>
        memuc textequ <WORD>
    ELSEIFIDN <mem>,<m32>
        memlc textequ <dword>
        memuc textequ <DWORD>
    ELSEIFIDN <mem>,<m64>
        memlc textequ <qword>
        memuc textequ <QWORD>
    ELSE
        .ERR <mem must be one of: m8, m16, m32, m64>
        EXITM
    ENDIF
    src_dup textequ <src>
    ; memory operand? look for a bracket or an explicit "ptr"
    bracket INSTR <src>,<[>
    IF bracket EQ 0
        bracket INSTR <src>,<ptr>
    ENDIF
    IF bracket EQ 0
        bracket INSTR <src>,<PTR>
    ENDIF
    IF bracket GT 0
        memtype INSTR <src>,memlc
        IF memtype EQ 0
            memtype INSTR <src>,memuc
        ENDIF
        IF memtype GT 0
            ; cut the size keyword out of src and splice " oword " in its place
            f1mem SUBSTR <src>, 1, memtype - 1
            f2mem SUBSTR <src>, memtype + @SizeStr( memlc )
            src_dup CATSTR <f1mem>, < oword >, <f2mem>
            sni_instruction dst, %src_dup, nis, opc, imm8
        ELSE
            .ERR <must be: &memlc ptr >
            EXITM
        ENDIF
    ELSE
        IS_GPRDQ src, GPRDQ
        IF GPRDQ EQ 0
            .ERR <bad source operand>
        ELSE
            sni_instr_gpr_new dst, src, nis, opc, imm8
        ENDIF
    ENDIF
endm
;-------------------------------------------------------------------------------
; sni_instr_dst_m_gpr dst, src, nis, opc, mem [, imm8]
;
; Front end for extract-style instructions whose destination is either a
; general-purpose register or a memory operand narrower than 128 bits.
;
;   dst, src - operands as written by the user
;   nis, opc - escape / opcode bytes, passed through unchanged
;   mem      - size tag of the memory form: m8, m16, m32 or m64
;   imm8     - optional immediate, passed through unchanged
;
; The operands are handed to the encoder in swapped order (src first, then
; dst), exactly as the calls below show.
; Memory destination (dst contains "[", "ptr" or "PTR"): the size keyword
;   selected by mem must occur in dst; it is replaced by " oword " and the
;   result is handed to sni_instruction.  Otherwise .ERR is raised.
; Register destination: must be recognised by IS_GPRDQ and is handed to
;   sni_instr_gpr_new; otherwise .ERR is raised.
;
; An unrecognised mem tag is rejected with .ERR.  memlc/memuc are global text
; macros, so without this check they would be undefined or keep the value left
; behind by an earlier expansion.
;-------------------------------------------------------------------------------
sni_instr_dst_m_gpr macro dst:req, src:req, nis:req, opc:req, mem:req, imm8
    IFIDN <mem>,<m8>
        memlc textequ <byte>
        memuc textequ <BYTE>
    ELSEIFIDN <mem>,<m16>
        memlc textequ <word>
        memuc textequ <WORD>
    ELSEIFIDN <mem>,<m32>
        memlc textequ <dword>
        memuc textequ <DWORD>
    ELSEIFIDN <mem>,<m64>
        memlc textequ <qword>
        memuc textequ <QWORD>
    ELSE
        .ERR <mem must be one of: m8, m16, m32, m64>
        EXITM
    ENDIF
    dst_dup textequ <dst>
    ; memory operand? look for a bracket or an explicit "ptr"
    bracket INSTR <dst>,<[>
    IF bracket EQ 0
        bracket INSTR <dst>,<ptr>
    ENDIF
    IF bracket EQ 0
        bracket INSTR <dst>,<PTR>
    ENDIF
    IF bracket GT 0
        memtype INSTR <dst>,memlc
        IF memtype EQ 0
            memtype INSTR <dst>,memuc
        ENDIF
        IF memtype GT 0
            ; cut the size keyword out of dst and splice " oword " in its place
            f1mem SUBSTR <dst>, 1, memtype - 1
            f2mem SUBSTR <dst>, memtype + @SizeStr( memlc )
            dst_dup CATSTR <f1mem>, < oword >, <f2mem>
            sni_instruction src, %dst_dup, nis, opc, imm8
        ELSE
            .ERR <must be: &memlc ptr >
            EXITM
        ENDIF
    ELSE
        IS_GPRDQ dst, GPRDQ
        IF GPRDQ EQ 0
            .ERR <bad destination operand>
        ELSE
            sni_instr_gpr_new src, dst, nis, opc, imm8
        ENDIF
    ENDIF
endm
; OPTION NOKEYWORD:<mpsadbw>
; 66 0F 3A 42 /r ib  mpsadbw xmm1, xmm2/m128, imm8
; A memory source must be written as "oword ptr [...]": sni_instruction rejects
; any bracketed operand that does not contain "oword"/"OWORD".
mpsadbw macro dst:req, src:req, imm8:req
    %sni_instruction dst, src, nis_snia, opc_mpsadbw, imm8
;   db imm8
endm
IF _IPP32E GE _IPP32E_Y8


    OPTION NOKEYWORD:<pextrw>
;-------------------------------------------------------------------------------
; pextrw dst, src, imm8
;
;       0F C5 /r ib     pextrw r32, mm, imm8          (src is an MMX register)
;    66 0F 3A 15 /r ib  pextrw r32/m16, xmm2, imm8    (any other src)
;
; MMX source: the MMX register is replaced (REPLACE_MMX) by the GPR with the
;   same register index, a "mov dst, gpr" carrier is assembled to obtain the
;   optional REX byte and the ModRM byte, and its opcode is overwritten with
;   0F C5 through ORG.
; Any other source: forwarded to sni_instr_dst_m_gpr with the m16 size tag.
;
; The scratch flag f is declared local so that the macro does not create or
; overwrite a global symbol named "f".
;-------------------------------------------------------------------------------
pextrw macro dst:req, src:req, imm8:req
    local x1, y1, x2, y2, f
    IFMMX_REG src, f                            ; if mmx register - old (P4) coding should be used
    IF f GT 0
        gpr32_64 = 0                            ; 32-bit or 64-bit form is used?
        rexbyte = 0
        DO_NEED_REX dst, gpr32_64, rexbyte      ; test whether a REX byte is required
        s2rc textequ REPLACE_MMX( src, gpr32_64 ) ; substitute source mmx register with gpr that has the same index in mod/r/m byte
        IF rexbyte GT 0
        x1:
            mov dst, s2rc                       ; REX 8B /r
            org x1+1                            ; keep the first REX, re-assemble one byte later
            mov dst, s2rc                       ; REX REX 8B /r
        y1:
            org x1+1
            db 0Fh
            db 0C5h                             ; REX 0F C5 /r
            org y1
        ELSE
        x2:
            nop                                 ; placeholder byte, overwritten by 0Fh below
            mov dst, s2rc                       ; 90 8B /r
        y2:
            org x2
            db 0Fh
            db 0C5h                             ; 0F C5 /r
            org y2
        ENDIF
        db imm8                                 ; [REX] 0F C5 /r imm8
    ELSE
        %sni_instr_dst_m_gpr dst, src, nis_snia, opc_pextrw, m16, imm8
    ENDIF
endm
ENDIF
; OPTION NOKEYWORD:<pinsrq>
; 66 REX 0F 3A 22 pinsrq xmm1, r64/m64, imm8
; Passing opc_pinsrd is intentional: pinsrq has the same opcode byte (22h) as
; pinsrd, as the encoding line above shows; only the REX prefix and the m64
; size tag differ.
pinsrq macro dst:req, src:req, imm8:req
    %sni_instr_src_m_gpr dst, src, nis_snia, opc_pinsrd, m64, imm8
;   db imm8
endm
opc_pmovsxbq, m16 +endm + +; OPTION NOKEYWORD:<pmovsxwd> +; 66 0F 38 23 pmovsxwd xmm1, xmm2/m64 +pmovsxwd macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovsxwd, m64 +endm + +; OPTION NOKEYWORD:<pmovsxwq> +; 66 0F 38 24 pmovsxwq xmm1, xmm2/m32 +pmovsxwq macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovsxwq, m32 +endm + +; OPTION NOKEYWORD:<pmovsxdq> +; 66 0F 38 25 pmovsxdq xmm1, xmm2/m64 +pmovsxdq macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovsxdq, m64 +endm + +; OPTION NOKEYWORD:<pmovzxbw> +; 66 0F 38 30 pmovzxbw xmm1, xmm2/m64 +pmovzxbw macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxbw, m64 +endm + +; OPTION NOKEYWORD:<pmovzxbd> +; 66 0F 38 31 pmovzxbd xmm1, xmm2/m32 +pmovzxbd macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxbd, m32 +endm + +; OPTION NOKEYWORD:<pmovzxbq> +; 66 0F 38 32 pmovzxbq xmm1, xmm2/m16 +pmovzxbq macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxbq, m16 +endm + +; OPTION NOKEYWORD:<pmovzxwd> +; 66 0F 38 33 pmovzxwd xmm1, xmm2/m64 +pmovzxwd macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxwd, m64 +endm + +; OPTION NOKEYWORD:<pmovzxwq> +; 66 0F 38 34 pmovzxwq xmm1, xmm2/m32 +pmovzxwq macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxwq, m32 +endm + +; OPTION NOKEYWORD:<pmovzxdq> +; 66 0F 38 35 pmovzxdq xmm1, xmm2/m64 +pmovzxdq macro dst:req, src:req + %sni_instr_src_m_xmm dst, src, nis_sni, opc_pmovzxdq, m64 +endm + +; OPTION NOKEYWORD:<pmuldq> +; 66 0F 38 28 pmuldq xmm1, xmm2/m128 +pmuldq macro dst:req, src:req + %sni_instruction dst, src, nis_sni, opc_pmuldq +endm + +; OPTION NOKEYWORD:<pmulld> +; 66 0F 38 40 pmulld xmm1, xmm2/m128 +pmulld macro dst:req, src:req + %sni_instruction dst, src, nis_sni, opc_pmulld +endm + +; OPTION NOKEYWORD:<ptest> +; 66 0F 38 17 ptest xmm1, xmm2/m128 +ptest macro dst:req, src:req + %sni_instruction dst, src, 
nis_sni, opc_ptest +endm + +; OPTION NOKEYWORD:<roundpd> +; 66 0F 3A 09 roundpd xmm1, xmm2/m128, imm8 +roundpd macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_snia, opc_roundpd, imm8 +; db imm8 +endm + +; OPTION NOKEYWORD:<roundps> +; 66 0F 3A 08 roundps xmm1, xmm2/m128, imm8 +roundps macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_snia, opc_roundps, imm8 +; db imm8 +endm + +; OPTION NOKEYWORD:<roundsd> +; 66 0F 3A 0B roundsd xmm1, xmm2/m64, imm8 +roundsd macro dst:req, src:req, imm8:req + %sni_instr_src_m_xmm dst, src, nis_snia, opc_roundsd, m64, imm8 +; db imm8 +endm + +; OPTION NOKEYWORD:<roundss> +; 66 0F 3A 0A roundss xmm1, xmm2/m32, imm8 +roundss macro dst:req, src:req, imm8:req + %sni_instr_src_m_xmm dst, src, nis_snia, opc_roundss, m32, imm8 +; db imm8 +endm + +; STTNI (SSE4.2) + +nis_sttni = 38h ; new instruction set +nis_sttnia = 3Ah ; new instruction set 'a' (with imm8) + +opc_pcmpestri = 61h +opc_pcmpestrm = 60h +opc_pcmpistri = 63h +opc_pcmpistrm = 62h +opc_pcmpgtq = 37h +opc_crc32_m8 = 0F0h +opc_crc32 = 0F1h + +; 66 0F 3A 61 pcmpestri xmm1, xmm2/m128, imm8 +pcmpestri macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_sttnia, opc_pcmpestri, imm8 +endm + +; 66 0F 3A 60 pcmpestrm xmm1, xmm2/m128, imm8 +pcmpestrm macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_sttnia, opc_pcmpestrm, imm8 +endm + +; 66 0F 3A 63 pcmpistri xmm1, xmm2/m128, imm8 +pcmpistri macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_sttnia, opc_pcmpistri, imm8 +endm + +; 66 0F 3A 62 pcmpistrm xmm1, xmm2/m128, imm8 +pcmpistrm macro dst:req, src:req, imm8:req + %sni_instruction dst, src, nis_sttnia, opc_pcmpistrm, imm8 +endm + +; 66 0F 38 37 pcmpgtq xmm1, xmm2/m128 +pcmpgtq macro dst:req, src:req + %sni_instruction dst, src, nis_sttni, opc_pcmpgtq +endm + + +; WSM (AES NI) + +opc_aesenc = 0DCh +opc_aesenclast = 0DDh +opc_aesdec = 0DEh +opc_aesdeclast = 0DFh +opc_aesimc = 0DBh +opc_aeskeygenassist = 0DFh 
;-------------------------------------------------------------------------------
; get3rdbyte reg, opc3
;
; Computes into the symbol named by opc3 the byte that avx20_double stores at
; offset 2 of its carrier instruction.
;
;   reg  - register name, used as the second operand of the carrier
;   opc3 - name of the numeric symbol that receives the result
;
; Start value: 081H when IS_XMMALL recognises reg as an XMM register, 085H
; otherwise (the two differ in bit 2 only).  8 is then added for every ALL_NUM
; entry visited before one is found inside reg.
; NOTE(review): this matches the VEX byte layout W.vvvv.L.pp with W=1, pp=01
; and an inverted register number in vvvv, provided ALL_NUM is ordered from
; the highest register number down - confirm against the ALL_NUM definition.
;
; The scratch flag x is declared local so that the macro does not create or
; overwrite a global symbol named "x".
;-------------------------------------------------------------------------------
get3rdbyte MACRO reg:req, opc3:req
    local x
    IS_XMMALL reg, x                    ; x = 1 for an XMM register name, 0 otherwise
    IF x EQ 0
        opc3 = 085H
    ELSE
        opc3 = 081H
    ENDIF
    %FOR num,ALL_NUM
        IF @InStr( , reg, num ) NE 0
            EXITM                       ; register number found - opc3 is final
        ENDIF
        opc3 = opc3 + 8                 ; step the field that starts at bit 3
    ENDM
endm
y, z, 0B8H +endm +; VEX.DDS.128/256.66.0F38.W0 98 /r VFMADD132PS xmm0, xmm1, xmm2/m128 +vfmadd132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 98H +endm +; VEX.DDS.128/256.66.0F38.W0 A8 /r VFMADD213PS xmm0, xmm1, xmm2/m128 +vfmadd213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0A8H +endm +; VEX.DDS.128/256.66.0F38.W0 B8 /r VFMADD231PS xmm0, xmm1, xmm2/m128 +vfmadd231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0B8H +endm + +; VEX.DDS.128/256.66.0F38.W1 99 /r VFMADD132SD xmm0, xmm1, xmm2/m128 +vfmadd132sd macro x:req, y:req, z:req + %avx20_double x, y, z, 99H +endm +; VEX.DDS.128/256.66.0F38.W1 A9 /r VFMADD213SD xmm0, xmm1, xmm2/m128 +vfmadd213sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0A9H +endm +; VEX.DDS.128/256.66.0F38.W1 B9 /r VFMADD231SD xmm0, xmm1, xmm2/m128 +vfmadd231sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0B9H +endm + +; VEX.DDS.128/256.66.0F38.W0 99 /r VFMADD132SS xmm0, xmm1, xmm2/m128 +vfmadd132ss macro x:req, y:req, z:req + %avx20_float x, y, z, 99H +endm +; VEX.DDS.128/256.66.0F38.W0 A9 /r VFMADD213SS xmm0, xmm1, xmm2/m128 +vfmadd213ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0A9H +endm +; VEX.DDS.128/256.66.0F38.W0 B9 /r VFMADD231SS xmm0, xmm1, xmm2/m128 +vfmadd231ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0B9H +endm + +; VEX.DDS.128/256.66.0F38.W1 96 /r VFMADDSUB132PD xmm0, xmm1, xmm2/m128 +vfmaddsub132pd macro x:req, y:req, z:req + %avx20_double x, y, z, 96H +endm +; VEX.DDS.128/256.66.0F38.W1 A6 /r VFMADDSUB213PD xmm0, xmm1, xmm2/m128 +vfmaddsub213pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0A6H +endm +; VEX.DDS.128/256.66.0F38.W1 B6 /r VFMADDSUB231PD xmm0, xmm1, xmm2/m128 +vfmaddsub231pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0B6H +endm + +; VEX.DDS.128/256.66.0F38.W0 96 /r VFMADDSUB132PS xmm0, xmm1, xmm2/m128 +vfmaddsub132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 96H +endm +; VEX.DDS.128/256.66.0F38.W0 A6 /r VFMADDSUB213PS xmm0, xmm1, xmm2/m128 
+vfmaddsub213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0A6H +endm +; VEX.DDS.128/256.66.0F38.W0 B6 /r VFMADDSUB231PS xmm0, xmm1, xmm2/m128 +vfmaddsub231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0B6H +endm + +; VEX.DDS.128/256.66.0F38.W1 97 /r VFMSUBADD132PD xmm0, xmm1, xmm2/m128 +vfmsubadd132pd macro x:req, y:req, z:req + %avx20_double x, y, z, 97H +endm +; VEX.DDS.128/256.66.0F38.W1 A7 /r VFMSUBADD213PD xmm0, xmm1, xmm2/m128 +vfmsubadd213pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0A7H +endm +; VEX.DDS.128/256.66.0F38.W1 B7 /r VFMSUBADD231PD xmm0, xmm1, xmm2/m128 +vfmsubadd231pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0B7H +endm + +; VEX.DDS.128/256.66.0F38.W0 97 /r VFMSUBADD132PS xmm0, xmm1, xmm2/m128 +vfmsubadd132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 97H +endm +; VEX.DDS.128/256.66.0F38.W0 A7 /r VFMSUBADD213PS xmm0, xmm1, xmm2/m128 +vfmsubadd213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0A7H +endm +; VEX.DDS.128/256.66.0F38.W0 B7 /r VFMSUBADD231PS xmm0, xmm1, xmm2/m128 +vfmsubadd231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0B7H +endm + +; VEX.DDS.128/256.66.0F38.W1 9A /r VFMSUB132PD xmm0, xmm1, xmm2/m128 +vfmsub132pd macro x:req, y:req, z:req + %avx20_double x, y, z, 9AH +endm +; VEX.DDS.128/256.66.0F38.W1 AA /r VFMSUB213PD xmm0, xmm1, xmm2/m128 +vfmsub213pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0AAH +endm +; VEX.DDS.128/256.66.0F38.W1 BA /r VFMSUB231PD xmm0, xmm1, xmm2/m128 +vfmsub231pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BAH +endm + +; VEX.DDS.128/256.66.0F38.W0 9A /r VFMSUB132PS xmm0, xmm1, xmm2/m128 +vfmsub132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 9AH +endm +; VEX.DDS.128/256.66.0F38.W0 AA /r VFMSUB213PS xmm0, xmm1, xmm2/m128 +vfmsub213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0AAH +endm +; VEX.DDS.128/256.66.0F38.W0 BA /r VFMSUB231PS xmm0, xmm1, xmm2/m128 +vfmsub231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0BAH 
+endm + +; VEX.DDS.128/256.66.0F38.W1 9B /r VFMSUB132SD xmm0, xmm1, xmm2/m128 +vfmsub132sd macro x:req, y:req, z:req + %avx20_double x, y, z, 9BH +endm +; VEX.DDS.128/256.66.0F38.W1 AB /r VFMSUB213SD xmm0, xmm1, xmm2/m128 +vfmsub213sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0ABH +endm +; VEX.DDS.128/256.66.0F38.W1 BB /r VFMSUB231SD xmm0, xmm1, xmm2/m128 +vfmsub231sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BBH +endm + +; VEX.DDS.128/256.66.0F38.W0 9B /r VFMSUB132SS xmm0, xmm1, xmm2/m128 +vfmsub132ss macro x:req, y:req, z:req + %avx20_float x, y, z, 9BH +endm +; VEX.DDS.128/256.66.0F38.W0 AB /r VFMSUB213SS xmm0, xmm1, xmm2/m128 +vfmsub213ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0ABH +endm +; VEX.DDS.128/256.66.0F38.W0 BB /r VFMSUB231SS xmm0, xmm1, xmm2/m128 +vfmsub231ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0BBH +endm + +; VEX.DDS.128/256.66.0F38.W1 9C /r VFNMADD132PD xmm0, xmm1, xmm2/m128 +vfnmadd132pd macro x:req, y:req, z:req + %avx20_double x, y, z, 9CH +endm +; VEX.DDS.128/256.66.0F38.W1 AC /r VFNMADD213PD xmm0, xmm1, xmm2/m128 +vfnmadd213pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0ACH +endm +; VEX.DDS.128/256.66.0F38.W1 BC /r VFNMADD231PD xmm0, xmm1, xmm2/m128 +vfnmadd231pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BCH +endm +; VEX.DDS.128/256.66.0F38.W0 9C /r VFNMADD132PS xmm0, xmm1, xmm2/m128 +vfnmadd132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 9CH +endm +; VEX.DDS.128/256.66.0F38.W0 AC /r VFNMADD213PS xmm0, xmm1, xmm2/m128 +vfnmadd213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0ACH +endm +; VEX.DDS.128/256.66.0F38.W0 BC /r VFNMADD231PS xmm0, xmm1, xmm2/m128 +vfnmadd231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0BCH +endm + +; VEX.DDS.128/256.66.0F38.W1 9D /r VFNMADD132SD xmm0, xmm1, xmm2/m128 +vfnmadd132sd macro x:req, y:req, z:req + %avx20_double x, y, z, 9DH +endm +; VEX.DDS.128/256.66.0F38.W1 AD /r VFNMADD213SD xmm0, xmm1, xmm2/m128 +vfnmadd213sd macro x:req, 
y:req, z:req + %avx20_double x, y, z, 0ADH +endm +; VEX.DDS.128/256.66.0F38.W1 BD /r VFNMADD231SD xmm0, xmm1, xmm2/m128 +vfnmadd231sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BDH +endm + +; VEX.DDS.128/256.66.0F38.W0 9D /r VFNMADD132SS xmm0, xmm1, xmm2/m128 +vfnmadd132ss macro x:req, y:req, z:req + %avx20_float x, y, z, 9DH +endm +; VEX.DDS.128/256.66.0F38.W0 AD /r VFNMADD213SS xmm0, xmm1, xmm2/m128 +vfnmadd213ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0ADH +endm +; VEX.DDS.128/256.66.0F38.W0 BD /r VFNMADD231SS xmm0, xmm1, xmm2/m128 +vfnmadd231ss macro x:req, y:req, z:req + %avx20_float x, y, z, 0BDH +endm + +; VEX.DDS.128/256.66.0F38.W1 9E /r VFNMSUB132PD xmm0, xmm1, xmm2/m128 +vfnmsub132pd macro x:req, y:req, z:req + %avx20_double x, y, z, 9EH +endm +; VEX.DDS.128/256.66.0F38.W1 AE /r VFNMSUB213PD xmm0, xmm1, xmm2/m128 +vfnmsub213pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0AEH +endm +; VEX.DDS.128/256.66.0F38.W1 BE /r VFNMSUB231PD xmm0, xmm1, xmm2/m128 +vfnmsub231pd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BEH +endm + +; VEX.DDS.128/256.66.0F38.W0 9E /r VFNMSUB132PS xmm0, xmm1, xmm2/m128 +vfnmsub132ps macro x:req, y:req, z:req + %avx20_float x, y, z, 9EH +endm +; VEX.DDS.128/256.66.0F38.W0 AE /r VFNMSUB213PS xmm0, xmm1, xmm2/m128 +vfnmsub213ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0AEH +endm +; VEX.DDS.128/256.66.0F38.W0 BE /r VFNMSUB231PS xmm0, xmm1, xmm2/m128 +vfnmsub231ps macro x:req, y:req, z:req + %avx20_float x, y, z, 0BEH +endm + +; VEX.DDS.128/256.66.0F38.W1 9F /r VFNMSUB132SD xmm0, xmm1, xmm2/m128 +vfnmsub132sd macro x:req, y:req, z:req + %avx20_double x, y, z, 9FH +endm +; VEX.DDS.128/256.66.0F38.W1 AF /r VFNMSUB213SD xmm0, xmm1, xmm2/m128 +vfnmsub213sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0AFH +endm +; VEX.DDS.128/256.66.0F38.W1 BF /r VFNMSUB231SD xmm0, xmm1, xmm2/m128 +vfnmsub231sd macro x:req, y:req, z:req + %avx20_double x, y, z, 0BFH +endm + +; VEX.DDS.128/256.66.0F38.W0 9F /r 
VFNMSUB132SS xmm0, xmm1, xmm2/m128
+vfnmsub132ss macro x:req, y:req, z:req
+    %avx20_float x, y, z, 9FH
+endm
+; VEX.DDS.128/256.66.0F38.W0 AF /r VFNMSUB213SS xmm0, xmm1, xmm2/m128
+vfnmsub213ss macro x:req, y:req, z:req
+    %avx20_float x, y, z, 0AFH
+endm
+; VEX.DDS.128/256.66.0F38.W0 BF /r VFNMSUB231SS xmm0, xmm1, xmm2/m128
+vfnmsub231ss macro x:req, y:req, z:req
+    %avx20_float x, y, z, 0BFH
+endm
+
+; substitution because of a bug in ml10.0 version 10.00.30128.01
+; VEX.256.66.0F3A 19 /r ib
+; How the substitution works: "vpermilpd yy, z, imm" is assembled as a carrier
+; and the byte at x1+3 is then overwritten with 19H, after which the location
+; counter is moved back to the end of the carrier (x2).
+; z is built from xx by dropping its first character and prepending either
+; "ymm" (when xx contains "word"/"WORD", i.e. a "... word ptr" memory operand)
+; or "y" (otherwise, e.g. xmm1 -> ymm1).
+; NOTE(review): offset 3 assumes the assembler emits a 3-byte VEX prefix for
+; the carrier instruction - confirm.
+    OPTION NOKEYWORD:< vextractf128>
+    vextractf128 macro xx:req, yy:req, imm:req
+    local x1, x2, q, f, z, memoprndl, memoprndu, memopl, memopu
+    memoprndl textequ <word>
+    memoprndu textequ <WORD>
+    memopl INSTR <xx>, memoprndl
+    memopu INSTR <xx>, memoprndu
+    IF (memopl+memopu) GT 0
+        q textequ <ymm>
+        f SUBSTR <xx>, 2
+        z CATSTR q, f
+    ELSE
+        q textequ <y>
+        f SUBSTR <xx>, 2
+        z CATSTR q, f
+    ENDIF
+    x1:
+        vpermilpd yy, z, imm
+    x2:
+        org x1+3
+        db 19H
+        org x2
+    endm
+; AVX2 (HSW)
+
+; getW0W1 - computes the byte value opc3 that vpsllvq writes at offset 2 of
+; its carrier instruction.
+;   reg  - register operand text that is searched for a register number
+;   opc3 - name of the numeric symbol that receives the result
+;   w0w1 - 0 subtracts 80H from the result, any other value leaves it as is
+; opc3 starts at 085H when IS_XMMALL leaves x equal to 0 and at 081H
+; otherwise, and grows by 8 for every ALL_NUM entry (15 down to 0) that is
+; tried before the first one found inside reg.
+; NOTE(review): IS_XMMALL is defined outside this section; presumably opc3 is
+; the third byte of a 3-byte VEX prefix (W, inverted vvvv, L, pp) - confirm.
+getW0W1 MACRO reg:req, opc3:req, w0w1:req
+    IS_XMMALL reg, x
+    IF x EQ 0
+        opc3 = 085H
+    ELSE
+        opc3 = 081H
+    ENDIF
+    %FOR num,ALL_NUM
+        IF @InStr( , reg, num ) NE 0
+            EXITM
+        ENDIF
+        opc3 = opc3 + 8
+    ENDM
+    IF w0w1 EQ 0
+        opc3 = opc3 - 80H
+    ENDIF
+endm
+
+;VEX.NDS.128.66.0F38.W0 47 /r
+; Assembles "vpermilps op1, op2, op3" as a carrier and overwrites the byte at
+; offset 3 with 47H.
+vpsllvd MACRO op1:req, op2:req, op3:req
+local x0, x1
+    x0:
+        vpermilps op1, op2, op3
+    x1:
+        org x0+3
+        db 47H
+        org x1
+endm
+
+;VEX.NDS.128.66.0F38.W1 47 /r
+; Same carrier as vpsllvd, but the two bytes at offsets 2 and 3 are rewritten:
+; offset 2 receives the value computed by getW0W1 for op2 (w0w1 = 1), offset 3
+; receives 47H. opc3 is a module-level numeric symbol, not a macro local.
+vpsllvq MACRO op1:req, op2:req, op3:req
+local x0, x1
+    x0:
+        vpermilps op1, op2, op3
+    x1:
+        org x0+2
+        getW0W1 <op2>, opc3, 1
+        db opc3
+        db 47H
+        org x1
+endm
+ENDIF ; IFNDEF ML1100
+
+;IFNDEF ML1200
+; BDW MACRO for ML1100 adox & adcx
+
+; Register name lists used by REPLACE_GPR and TEST_REX. Each register appears
+; twice (lower case, upper case), so list position / 2 is the register number.
+ALL_XMM textequ <!<xmm0,XMM0,xmm1,XMM1,xmm2,XMM2,xmm3,XMM3,xmm4,XMM4,xmm5,XMM5,xmm6,XMM6,xmm7,XMM7,xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15!>>
+ALL_GPR textequ <!<ax,AX,cx,CX,dx,DX,bx,BX,sp,SP,bp,BP,si,SI,di,DI,r8,R8,r9,R9,r10,R10,r11,R11,r12,R12,r13,R13,r14,R14,r15,R15!>>
+REX_GPR textequ <!<r8,R8,r9,R9,r10,R10,r11,R11,r12,R12,r13,R13,r14,R14,r15,R15!>>
+DD_GPR textequ <!<eax,EAX,ecx,ECX,edx,EDX,ebx,EBX,esp,ESP,ebp,EBP,esi,ESI,edi,EDI,r8d,R8D,r9d,R9D,r10d,R10D,r11d,R11D,r12d,R12D,r13d,R13D,r14d,R14D,r15d,R15D!>>
+
+; The GPR is recognised by substring search, so "rax", "eax" and "ax" all
+; match the "ax" entry; the XMM name at the same list position is returned.
+REPLACE_GPR MACRO x ; this macro substitutes any GPR register
+    xretxmm textequ <> ; with XMM equivalent (with the same index in mod/r/m byte)
+    gpridx = 0
+    %FOR igpr,ALL_GPR
+        IF @InStr(,x,igpr) NE 0
+            xmmidx = 0
+            %FOR ixmm,ALL_XMM
+                IF xmmidx EQ gpridx
+                    xretxmm textequ <ixmm>
+                    EXITM xretxmm
+                ENDIF ; if idx xmm & gpr is EQ
+                xmmidx = xmmidx + 1
+            ENDM ; for ixmm
+            IF @SizeStr(%xretxmm) GT 0
+                EXITM xretxmm
+            ENDIF
+        ENDIF
+        gpridx = gpridx + 1
+    ENDM ; for igpr
+    EXITM xretxmm ; if replacement has not been found - return empty string that will cause ASM error
+ENDM
+
+; TEST_REX - classifies the operands of adcx/adox.
+;   rex   <- 1 when x or y contains one of the r8..r15 names, otherwise 0
+;   bit64 <- 0 when x is exactly one of the 32-bit register names in DD_GPR,
+;            otherwise 1
+TEST_REX MACRO x:req, y:req, rex:req, bit64:req
+    rex = 0
+    %FOR igpr,REX_GPR
+        IF @InStr(,x,igpr) NE 0
+            rex = 1
+            EXITM
+        ENDIF
+        IF @InStr(,y,igpr) NE 0
+            rex = 1
+            EXITM
+        ENDIF
+    ENDM ; for igpr
+    bit64 = 1
+    %FOR igpr,DD_GPR
+        IFIDN <igpr>, <x>
+            bit64 = 0
+            EXITM
+        ENDIF
+    ENDM ; for igpr
+ENDM
+
+IFDEF ML1200
+
+OPTION NOKEYWORD:<adcx>
+OPTION NOKEYWORD:<adox>
+
+ENDIF
+
+; REX.W 66.0F38.F6/r
+; adcx is produced from a pinsrq (64-bit) or pinsrd (32-bit) carrier whose
+; first operand is the XMM register returned by REPLACE_GPR for op1. Two bytes
+; of the carrier are overwritten with 038H, 0F6H and the location counter is
+; then set to x1 - 1, i.e. one byte before the end of the carrier, which drops
+; the carrier's last byte (the immediate 0).
+; The patch offset is 3 for the 64-bit form and 2 + rex for the 32-bit form,
+; where rex is the 0/1 flag computed by TEST_REX.
+; op1subst and bit64 are module-level symbols, not macro locals.
+adcx MACRO op1:req, op2:req
+    local x0, x1, rex
+    op1subst textequ REPLACE_GPR( op1 )
+    TEST_REX op1, op2, rex, bit64
+    rex = rex + 2
+    if bit64 GT 0
+    x0:
+        pinsrq op1subst, op2, 0
+    x1:
+        org x0 + 3
+        db 038H
+        db 0F6H
+        org x1 - 1
+    else
+    x0:
+        pinsrd op1subst, op2, 0
+    x1:
+        org x0 + rex
+        db 038H
+        db 0F6H
+        org x1 - 1
+    endif
+endm
+
+; REX.W F3.0F38.F6/r
+; Same scheme as adcx; in addition the first byte of the carrier is
+; overwritten with 0F3H.
+adox MACRO op1:req, op2:req
+    local x0, x1, rex
+    op1subst textequ REPLACE_GPR( op1 )
+    TEST_REX op1, op2, rex, bit64
+    rex = rex + 2
+    if bit64 GT 0
+    x0:
+        pinsrq op1subst, op2, 0
+    x1:
+        org x0
+        db 0F3H
+        org x0 + 3
+        db 038H
+        db 0F6H
+        org x1 - 1
+    else
+    x0:
+        pinsrd op1subst, op2, 0
+    x1:
+        org x0
+        db 0F3H
+        org x0 + rex
+        db 038H
+        db 0F6H
+        org x1 - 1
+    endif
+endm
+;ENDIF ; IFNDEF ML1200
+
+
+;IFNDEF ML1400
+IFDEF ML1400
+    OPTION NOKEYWORD:<sha1rnds4>
+    OPTION NOKEYWORD:<sha1nexte>
+    OPTION NOKEYWORD:<sha1msg1>
+    OPTION NOKEYWORD:<sha1msg2>
+    OPTION NOKEYWORD:<sha256rnds2>
+    OPTION NOKEYWORD:<sha256msg1>
+    OPTION NOKEYWORD:<sha256msg2>
+ENDIF
+
+; Register name lists for the SHA substitution macros. The HIGH* lists and the
+; matching LOW* lists are ordered so that entries at the same position name
+; registers with the same low three index bits and the same letter case
+; (CVT_GPR / CVT_XMM rely on this positional pairing).
+HIGHQ_GPR textequ <!<r8,R8,r9,R9,r10,R10,r11,R11,r12,R12,r13,R13,r14,R14,r15,R15!>>
+LOWQ_GPR textequ <!<rax,RAX,rcx,RCX,rdx,RDX,rbx,RBX,rsp,RSP,rbp,RBP,rsi,RSI,rdi,RDI!>>
+HIGH_XMM textequ <!<xmm8,XMM8,xmm9,XMM9,xmm10,XMM10,xmm11,XMM11,xmm12,XMM12,xmm13,XMM13,xmm14,XMM14,xmm15,XMM15!>>
+LOW_XMM textequ <!<xmm0,XMM0,xmm1,XMM1,xmm2,XMM2,xmm3,XMM3,xmm4,XMM4,xmm5,XMM5,xmm6,XMM6,xmm7,XMM7!>>
+HIGHDQ_GPR textequ <!<R8D,r8d,R8,r8,R9D,r9d,R9,r9,R10D,r10d,R10,r10,R11D,r11d,R11,r11,R12D,r12d,R12,r12,R13D,r13d,R13,r13,R14D,r14d,R14,r14,R15D,r15d,R15,r15!>>
+LOWDQ_GPR textequ <!<EAX,eax,RAX,rax,ECX,ecx,RCX,rcx,EDX,edx,RDX,rdx,EBX,ebx,RBX,rbx,ESP,esp,RSP,rsp,EBP,ebp,RBP,rbp,ESI,esi,RSI,rsi,EDI,edi,RDI,rdi!>>
+LOWD_GPR textequ <!<eax,EAX,ecx,ECX,edx,EDX,ebx,EBX,esp,ESP,ebp,EBP,esi,ESI,edi,EDI!>>
+HIGHD_GPR textequ <!<r8d,R8D,r9d,R9D,r10d,R10D,r11d,R11D,r12d,R12D,r13d,R13D,r14d,R14D,r15d,R15D!>>
+LOWW_GPR textequ <!<ax,AX,cx,CX,dx,DX,bx,BX,sp,SP,bp,BP,si,SI,di,DI!>>
+HIGHW_GPR textequ <!<r8w,R8W,r9w,R9W,r10w,R10W,r11w,R11W,r12w,R12W,r13w,R13W,r14w,R14W,r15w,R15W!>>
+LOWB_GPR textequ <!<al,AL,cl,CL,dl,DL,bl,BL,ah,AH,ch,CH,dh,DH,bh,BH!>>
+HIGHB_GPR textequ <!<r8b,R8B,r9b,R9B,r10b,R10B,r11b,R11B,r12b,R12B,r13b,R13B,r14b,R14B,r15b,R15B,spl,SPL,bpl,BPL,sil,SIL,dil,DIL!>>
+ALL_NUM textequ <!<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0!>>
+
+; YES_REX - sets the numeric symbol REX to 1 when x is exactly one of the
+; xmm8..xmm15 names or contains one of the r8..r15 / r8d..r15d names, else 0.
+YES_REX MACRO x, REX
+    REX = 0
+    %FOR yrex,HIGH_XMM ; if xmm from 8-15 range - REX byte is required
+        IFIDN <yrex>,<x>
+            REX = 1
+            EXITM
+        ENDIF
+    ENDM
+    IF REX EQ 0
+        %FOR yrex,HIGHDQ_GPR ; if gpr from 8-15 range - REX byte is required
+            IF @InStr( , x, yrex ) NE 0
+                REX = 1
+                EXITM
+            ENDIF
+        ENDM
+    ENDIF
+ENDM
+
+; Only the first high register found in x is replaced per call; the text
+; before and after it is kept unchanged.
+CVT_GPR MACRO x ; this macro substitutes any gpr from the high half (8-15)
+    xretgpr textequ <x> ; with the gpr from the low half which produces the same
+    qgpr = 0 ; index in the mod/r/m and sib bytes
+    %FOR ygpr,HIGHDQ_GPR
+        posgpr INSTR <x>,<ygpr>
+        IF posgpr GT 0
+            fgpr = 0
+            %FOR zgpr,LOWDQ_GPR
+                IF fgpr EQ qgpr
+                    f1gpr SUBSTR <x>, 1, posgpr-1
+                    f2gpr SUBSTR <x>, posgpr + @SizeStr( ygpr )
+                    xretgpr CATSTR <f1gpr>, < zgpr >, <f2gpr>
+                    EXITM xretgpr
+                ENDIF ; if f == q
+                fgpr = fgpr + 1
+            ENDM ; for z
+        ENDIF ; if posx > 0
+        qgpr = qgpr + 1
+    ENDM ; for y
+    EXITM xretgpr
+ENDM
+
+CVT_XMM MACRO x ; this macro substitutes any xmm from the high half (8-15)
+    xretxmm textequ <x> ; with the xmm from the low half which produces the same
+    lxmm = 0 ; index in the mod/r/m byte
+    %FOR yxmm,HIGH_XMM
+        posxmm INSTR <x>,<yxmm>
+        IF posxmm GT 0
+            fxmm = 0
+            %FOR zxmm,LOW_XMM
+                IF fxmm EQ lxmm
+                    xretxmm textequ <zxmm>
+                    EXITM xretxmm
+                ENDIF ; if f == l
+                fxmm = fxmm + 1
+            ENDM ; for z
+        ENDIF ; if posx > 0
+        lxmm = lxmm + 1
+    ENDM ; for y
+    EXITM xretxmm
+ENDM
+
+; CVT_GPR is applied twice so that up to two high GPRs in one operand
+; (for example base and index of an address) are both replaced.
+CVT_HIGH MACRO x ; a wrapper for macros that substitute up-half registers
+    xs textequ CVT_GPR( x ) ; with their ia32 analogues that have the same index in
+    xs1 textequ CVT_GPR( %xs ) ; the mod/r/m byte
+    xs2 textequ CVT_XMM( %xs1 )
+    EXITM xs2
+ENDM
+
+YES_NAME MACRO x ; if "x" contains direct reference to memory operand (by
+    znam = ( OPATTR( x )) AND 0011110111y ; name defined in code or data section) 1 is returned
+    IF znam EQ 0 ; else 0
+        xnam = 1
+    ELSE
+        xnam = 0
+    ENDIF
+    EXITM %xnam
+ENDM
+
+CVT_MIMM MACRO x, y ; if "x" contains direct reference to memory operand (by
+    zimm = ( OPATTR( x )) AND 0011110111y ; name defined in code or data section) it is substituted
+    IF zimm EQ 0 ; by "y" operand in order to produce right REX byte, but
+        ximm textequ <y> ; don't produce relocation record (because current address
+    ELSE ; for relocation due to different instruction length is wrong)
+        ximm textequ <x>
+    ENDIF
+    EXITM ximm
+ENDM
+
+; sha_instruction - common emitter behind the SHA substitution macros below.
+;   dst, src - instruction operands; an operand containing "[" must also
+;              contain "oword" or "OWORD", otherwise .ERR is raised
+;   nis      - byte emitted after 0FH (the wrappers pass 38H or 3AH)
+;   opc      - opcode byte emitted after nis
+;   imm8     - optional immediate; when present it is appended with "db imm8"
+; The macro assembles movaps carrier instructions and then rewrites the
+; leading bytes in place with org/db, so that the assembler itself produces
+; the mod/r/m, sib and displacement bytes. Four layouts are used, selected by
+; whether an operand is a direct (named) memory reference (YES_NAME) and
+; whether a REX byte is required (YES_REX); the byte layout of each case is
+; given in the trailing comments of the corresponding lines.
+; REX, REXS, REXD, NAMS, NAMD, isname, s1rc, d1st, s2rc and d2st are
+; module-level symbols, not macro locals.
+sha_instruction macro dst:req, src:req, nis:req, opc:req, imm8
+    local x0, x1, x2, x3, x4, x5, x6, x7
+
+    bracket INSTR <src>,<[>
+    IF bracket GT 0
+        memtype INSTR <src>,<oword>
+        IF memtype EQ 0
+            memtype INSTR <src>,<OWORD>
+        ENDIF
+        IF memtype EQ 0
+            .ERR <src must contain: oword ptr >
+            EXITM
+        ENDIF
+    ENDIF
+    bracket INSTR <dst>,<[>
+    IF bracket GT 0
+        memtype INSTR <dst>,<oword>
+        IF memtype EQ 0
+            memtype INSTR <dst>,<OWORD>
+        ENDIF
+        IF memtype EQ 0
+            .ERR <dst must contain: oword ptr >
+            EXITM
+        ENDIF
+    ENDIF
+    YES_REX <src>,REX ; do we need REX byte due to src operand?
+    REXS = REX
+    IF REXS EQ 1 ; if yes - we have to prepare substitution in order
+        s1rc textequ CVT_HIGH( src ) ; to work correctly with direct memory operands
+    ELSE
+        s1rc textequ <src> ; else substitution is not required
+    ENDIF
+    YES_REX <dst>,REX ; do we need REX byte due to dst operand?
+    REXD = REX
+    IF REXD EQ 1 ; if yes - we have to prepare substitution in order
+        d1st textequ CVT_HIGH( dst ) ; to work correctly with direct memory operands
+    ELSE
+        d1st textequ <dst> ; else substitution is not required
+    ENDIF
+    REX = REXS + REXD
+    NAMS = YES_NAME( src ) ; is there the direct memory operand (defined by name in code
+    NAMD = YES_NAME( dst ) ; or data section)? if yes - then another algorithm for macro
+    isname = NAMS + NAMD ; substitution due to bug in ml with relocations definition
+    s2rc textequ CVT_MIMM( src, xmm0 )
+    d2st textequ CVT_MIMM( dst, xmm0 )
+    IF isname GT 0 ; if src or dst contains direct reference to memory operand
+        IF REX GT 0
+        x0:
+            nop
+            nop
+            movaps d1st,s1rc ; 90 90 0F 28 /r m32
+        x1:
+            org x0
+            movaps d2st,s2rc ; REX 0F 28 /r /r m32
+            org x0+2
+            db nis
+            db opc
+            IFNB <imm8>
+                org x0+5
+                dd 0FFFFFFFFH
+                org x1 ; REX 0F nis opc /r m32
+                db imm8
+            ELSE
+                org x1
+            ENDIF
+        ELSE
+        x2:
+            nop
+            movaps dst, src ; 90 0F 28 /r m32
+        x3:
+            org x2
+            db 0FH
+            db nis
+            db opc
+            IFNB <imm8>
+                org x2+4
+                dd 0FFFFFFFFH
+                org x3 ; 0F nis opc /r m32
+                db imm8
+            ELSE
+                org x3
+            ENDIF
+        ENDIF
+    ELSE ; if src or dst doesn't contain direct reference to memory operand
+        IF REX GT 0
+        x4:
+            movaps dst,src ; REX 0F 28 /r
+            org x4+1
+            movaps dst,src ; REX REX 0F 28 /r
+        x5:
+            org x4+1
+            db 0FH
+            db nis
+            db opc
+            org x5 ; REX 0F nis opc /r
+            IFNB <imm8>
+                db imm8
+            ENDIF
+        ELSE
+        x6:
+            nop
+            movaps dst, src ; 90 0F 28 /r
+        x7:
+            org x6
+            db 0FH
+            db nis
+            db opc
+            org x7 ; 0F nis opc /r
+            IFNB <imm8>
+                db imm8
+            ENDIF
+        ENDIF
+    ENDIF
+endm
+
+; The wrappers below only select the two opcode bytes; the trailing comma
+; passes an empty imm8 argument to sha_instruction.
+
+; 0F 3A CC /r ib
+sha1rnds4 MACRO op1:req, op2:req, imm8:req
+    sha_instruction op1, op2, 3AH, 0CCH, imm8
+endm
+
+; 0F 38 C8 /r
+sha1nexte MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0C8H,
+endm
+
+; 0F 38 C9 /r
+sha1msg1 MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0C9H,
+endm
+
+; 0F 38 CA /r
+sha1msg2 MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0CAH,
+endm
+
+; 0F 38 CB /r <xmm0>
+sha256rnds2 MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0CBH,
+endm
+
+; 0F 38 CC /r
+sha256msg1 MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0CCH,
+endm
+
+; 0F 38 CD /r
+sha256msg2 MACRO op1:req, op2:req
+    sha_instruction op1, op2, 38H, 0CDH,
+endm
+
+;ENDIF ;ML1400
+
+ENDIF ; MNI & SNI macro for Linux or for Windows
+
+
+IF 0
+;; The example of
macro usage:
+.code
+
+my PROC NEAR PUBLIC
+    ;; The GPRs (general purpose registers) to be preserved (if used):
+    ;; rbp, rbx, rsi, rdi, r12, r13, r14, r15.
+    USES_GPR rbx, rsi, rdi, rbp, rax, r12
+    ;; Local frame must be always set (to zero, if it is not used).
+    LOCAL_FRAME = 100
+    ;; The XMM registers to be preserved (if used):
+    ;; XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15
+    USES_XMM xmm4,xmm7,xmm11
+    ;; Number of input parameters.
+    COMP_ABI 9
+    nop
+    ;; Restore all saved XMMs.
+    REST_XMM
+    ;; Restore all saved GPRs.
+    REST_GPR
+    ret
+my ENDP
+
+END
+ENDIF
+
+; CACHE_SIZE_TABLE - emits the label TableCacheSize followed by a list of
+; two-byte entries and a terminating zero byte. The first byte of an entry is
+; the value that GET_CACHE_SIZE / GET_CACHE_SIZE_CORE compare with the bytes
+; returned by CPUID leaf 2, the second byte encodes the cache size as
+; described in the header below. The lookup macros walk the list from the top
+; and stop at the first entry whose value is present, so larger sizes win.
+CACHE_SIZE_TABLE MACRO
+TableCacheSize:
+;=========================================
+; Code: bits [7-4] - code_of_size
+; Code: bits [3-0] - shift
+; CACHE_SIZE = code_of_size << (shift + 18)
+; |Value| |Code|
+;=========================================
+db 0ech, 0c3h ; 24M 24, 64, L3 ; from doc cpuid for Nehalem
+db 0ebh, 093h ; 18M 24, 64, L3 ; from doc cpuid for Nehalem
+db 04dh, 016h ; 16M 16, 64, L3
+db 0eah, 034h ; 12M 24, 64, L3 ; from doc cpuid for Nehalem
+db 04ch, 034h ; 12M 12, 64, L3
+db 0e4h, 015h ; 8M 16, 64, L3 ; from doc cpuid for Nehalem
+db 0deh, 015h ; 8M 12, 64, L3 ; from doc cpuid for Nehalem
+db 04bh, 015h ; 8M 16, 64, L3
+db 047h, 015h ; 8M 8, 64, L3
+db 04eh, 033h ; 6M 24, 64, L3
+db 04ah, 033h ; 6M 12, 64, L3
+db 0e3h, 014h ; 4M 16, 64, L3 ; from doc cpuid for Nehalem
+db 0ddh, 014h ; 4M 12, 64, L3 ; from doc cpuid for Nehalem
+db 0d8h, 014h ; 4M 8, 64, L3 ; from doc cpuid for Nehalem
+db 049h, 014h ; 4M 16, 64, L3
+db 029h, 014h ; 4M 8, 64, L3
+db 046h, 014h ; 4M 4, 64, L3
+db 048h, 032h ; 3M 12, 64, L3
+db 0e2h, 013h ; 2M 16, 64, L3 ; from doc cpuid for Nehalem
+db 0dch, 013h ; 2M 12, 64, L3 ; from doc cpuid for Nehalem
+db 0d7h, 013h ; 2M 8, 64, L3 ; from doc cpuid for Nehalem
+db 0d2h, 013h ; 2M 4, 64, L3 ; from doc cpuid for Nehalem
+db 025h, 013h ; 2M 8, 64, L3
+db 07dh, 013h ; 2M 8, 64, L2
+db 085h, 013h ; 2M 8, 32, L2
+db 045h, 013h ; 2M 4, 32, L2
+db 0d6h, 012h ; 1M 8, 64, L3 ; from doc cpuid for Nehalem
+db 0d1h, 012h ; 1M 4, 64, L3 ; from doc cpuid for Nehalem
+db 023h, 012h ; 1M 8, 64, L3
+db 087h, 012h ; 1M 8, 64, L2
+db 07ch, 012h ; 1M 8, 64, L2
+db 078h, 012h ; 1M 4, 64, L2
+db 084h, 012h ; 1M 8, 32, L2
+db 044h, 012h ; 1M 4, 32, L2
+db 0d0h, 011h ; 512K 4, 64, L3 ; from doc cpuid for Nehalem
+db 022h, 011h ; 512K 4, 64, L3
+db 07bh, 011h ; 512K 8, 64, L2
+db 080h, 011h ; 512K 8, 64, L2
+db 086h, 011h ; 512K 4, 64, L2
+db 03eh, 011h ; 512K 4, 64, L2
+db 07fh, 011h ; 512K 2, 64, L2
+db 083h, 011h ; 512K 8, 32, L2
+db 043h, 011h ; 512K 4, 32, L2
+db 0
+;=========================================
+ENDM
+
+; GET_CACHE_SIZE reg
+;   In:    reg = address of the table emitted by CACHE_SIZE_TABLE
+;   Out:   reg = cache size in bytes taken from the first table entry whose
+;                value byte is reported by CPUID leaf 2, or -1 when the vendor
+;                string is not "GenuineIntel", when AL returned by leaf 2 is
+;                not 1, or when no table entry matches
+;   Saved: rax, rbx, rcx, rdx, r8 are restored (reg itself receives the result)
+;   Clobb: flags
+;   Stack: 64 bytes are reserved for the duration of the macro
+;            [rsp +  0 .. 15]  bytes returned by CPUID leaf 2 (EAX, EBX, ECX, EDX)
+;            [rsp + 16 .. 55]  saved rax, rbx, rcx, rdx, r8
+;            [rsp + 56]        table pointer on entry, result on exit
+;   NOTE(review): the labels are not declared LOCAL and GET_CACHE_SIZE_CORE
+;   defines labels with the same names, so two expansions inside one label
+;   scope will presumably fail with a redefinition error - confirm.
+GET_CACHE_SIZE MACRO reg:REQ
+;=========================================
+    sub rsp, 64
+    mov [rsp + 16], rax
+    mov [rsp + 24], rbx
+    mov [rsp + 32], rcx
+    mov [rsp + 40], rdx
+    mov [rsp + 48], r8
+    mov [rsp + 56], reg ; Pointers to the TableCacheSize
+
+    xor eax, eax
+    cpuid
+
+    cmp ebx, 756E6547h ; "Genu"
+    jne CacheSizeMacro11 ; Not Intel
+    cmp edx, 49656E69h ; "ineI"
+    jne CacheSizeMacro11 ; Not Intel
+    cmp ecx, 6c65746eh ; "ntel"
+    jne CacheSizeMacro11 ; Not Intel
+
+    mov eax, 2
+    cpuid
+
+    cmp al, 1
+    jne CacheSizeMacro11
+
+    ; A register with bit 31 set carries no valid descriptors: clear it.
+    test eax, 080000000h
+    jz CacheSizeMacro00
+    xor eax, eax
+CacheSizeMacro00:
+    test ebx, 080000000h
+    jz CacheSizeMacro01
+    xor ebx, ebx
+CacheSizeMacro01:
+    test ecx, 080000000h
+    jz CacheSizeMacro02
+    xor ecx, ecx
+CacheSizeMacro02:
+    test edx, 080000000h
+    jz CacheSizeMacro03
+    xor edx, edx
+
+CacheSizeMacro03:
+    mov r8, rsp
+    ; EAX is always stored, even when it was cleared above. The scan below
+    ; never looks at byte 0 (it holds AL, the iteration count, not a
+    ; descriptor), so the bytes of the following registers must start at
+    ; offset 4. Skipping the store for a cleared EAX shifted them down to
+    ; offset 0 while the index was still advanced by 4 per register, which
+    ; made the scan read one uninitialized byte and miss the first real
+    ; descriptor. Zero bytes are harmless: table values are never zero.
+    mov [r8], eax
+    add r8, 4
+    mov eax, 3 ; eax = index of the last stored byte
+CacheSizeMacro04:
+    test ebx, ebx
+    jz CacheSizeMacro05
+    mov [r8], ebx
+    add r8, 4
+    add eax, 4
+CacheSizeMacro05:
+    test ecx, ecx
+    jz CacheSizeMacro06
+    mov [r8], ecx
+    add r8, 4
+    add eax, 4
+CacheSizeMacro06:
+    test edx, edx
+    jz CacheSizeMacro07
+    mov [r8], edx
+    add eax, 4
+
+CacheSizeMacro07:
+    mov rbx, [rsp + 56] ; rbx: Pointers to the TableCacheSize
+
+CacheSizeMacro08:
+    movzx edx, BYTE PTR [rbx] ; dl = value byte of the current table entry
+    test edx, edx
+    jz CacheSizeMacro11 ; end of table - nothing matched
+    add rbx, 2
+    mov ecx, eax
+CacheSizeMacro09:
+    cmp dl, BYTE PTR [rsp + rcx] ; scan stored bytes eax .. 1
+    je CacheSizeMacro10
+    sub ecx, 1
+    jnz CacheSizeMacro09
+    jmp CacheSizeMacro08
+
+CacheSizeMacro10:
+    movzx ebx, BYTE PTR [rbx - 1] ; code byte of the matching entry
+    mov ecx, ebx
+    shr ebx, 4 ; code_of_size
+    and ecx, 0fh ; shift
+    add ecx, 18
+    shl rbx, cl ; ebx: CacheSize
+    mov [rsp + 56], rbx
+    jmp CacheSizeMacro12
+
+CacheSizeMacro11:
+    mov QWORD PTR [rsp + 56], -1
+
+CacheSizeMacro12:
+    mov rax, [rsp + 16]
+    mov rbx, [rsp + 24]
+    mov rcx, [rsp + 32]
+    mov rdx, [rsp + 40]
+    mov r8, [rsp + 48]
+    mov reg, [rsp + 56]
+    add rsp, 64
+;=========================================
+ENDM
+
+; GET_CACHE_SIZE_CORE reg
+;   Same contract as GET_CACHE_SIZE, except that the size found in the table
+;   is divided (unsigned) by a core count before it is returned. The core
+;   count is EAX[31:26] + 1 of CPUID leaf 4 (sub-leaf 0) when the maximum
+;   basic leaf reported by CPUID leaf 0 is at least 4, otherwise 1.
+;   Stack: 72 bytes are reserved; the layout matches GET_CACHE_SIZE with the
+;          core count kept at [rsp + 64].
+;   NOTE(review): the labels are not declared LOCAL and GET_CACHE_SIZE defines
+;   labels with the same names, so two expansions inside one label scope will
+;   presumably fail with a redefinition error - confirm.
+GET_CACHE_SIZE_CORE MACRO reg:REQ
+;=========================================
+    sub rsp, 72
+    mov [rsp + 16], rax
+    mov [rsp + 24], rbx
+    mov [rsp + 32], rcx
+    mov [rsp + 40], rdx
+    mov [rsp + 48], r8
+    mov [rsp + 56], reg ; Pointers to the TableCacheSize
+
+    xor eax, eax
+    cpuid
+
+    cmp ebx, 756E6547h ; "Genu"
+    jne CacheSizeMacro11 ; Not Intel
+    cmp edx, 49656E69h ; "ineI"
+    jne CacheSizeMacro11 ; Not Intel
+    cmp ecx, 6c65746eh ; "ntel"
+    jne CacheSizeMacro11 ; Not Intel
+
+    cmp eax, 4 ; eax = maximum basic CPUID leaf
+    jb CoreMacro00 ; unsigned compare: the leaf number is not a signed value
+
+    mov eax, 4
+    xor ecx, ecx
+    cpuid
+    shr eax, 26
+    add eax, 1
+    mov [rsp + 64], rax ; cores
+    jmp CacheSizeMacro
+
+CoreMacro00:
+    mov QWORD PTR [rsp + 64], 1
+
+CacheSizeMacro:
+    mov eax, 2
+    cpuid
+
+    cmp al, 1
+    jne CacheSizeMacro11
+
+    ; A register with bit 31 set carries no valid descriptors: clear it.
+    test eax, 080000000h
+    jz CacheSizeMacro00
+    xor eax, eax
+CacheSizeMacro00:
+    test ebx, 080000000h
+    jz CacheSizeMacro01
+    xor ebx, ebx
+CacheSizeMacro01:
+    test ecx, 080000000h
+    jz CacheSizeMacro02
+    xor ecx, ecx
+CacheSizeMacro02:
+    test edx, 080000000h
+    jz CacheSizeMacro03
+    xor edx, edx
+
+CacheSizeMacro03:
+    mov r8, rsp
+    ; EAX is always stored, even when it was cleared above. The scan below
+    ; never looks at byte 0 (it holds AL, the iteration count, not a
+    ; descriptor), so the bytes of the following registers must start at
+    ; offset 4. Skipping the store for a cleared EAX shifted them down to
+    ; offset 0 while the index was still advanced by 4 per register, which
+    ; made the scan read one uninitialized byte and miss the first real
+    ; descriptor. Zero bytes are harmless: table values are never zero.
+    mov [r8], eax
+    add r8, 4
+    mov eax, 3 ; eax = index of the last stored byte
+CacheSizeMacro04:
+    test ebx, ebx
+    jz CacheSizeMacro05
+    mov [r8], ebx
+    add r8, 4
+    add eax, 4
+CacheSizeMacro05:
+    test ecx, ecx
+    jz CacheSizeMacro06
+    mov [r8], ecx
+    add r8, 4
+    add eax, 4
+CacheSizeMacro06:
+    test edx, edx
+    jz CacheSizeMacro07
+    mov [r8], edx
+    add eax, 4
+
+CacheSizeMacro07:
+    mov rbx, [rsp + 56] ; rbx: Pointers to the TableCacheSize
+
+CacheSizeMacro08:
+    movzx edx, BYTE PTR [rbx] ; dl = value byte of the current table entry
+    test edx, edx
+    jz CacheSizeMacro11 ; end of table - nothing matched
+    add rbx, 2
+    mov ecx, eax
+CacheSizeMacro09:
+    cmp dl, BYTE PTR [rsp + rcx] ; scan stored bytes eax .. 1
+    je CacheSizeMacro10
+    sub ecx, 1
+    jnz CacheSizeMacro09
+    jmp CacheSizeMacro08
+
+CacheSizeMacro10:
+    movzx eax, BYTE PTR [rbx - 1] ; code byte of the matching entry
+    mov ecx, eax
+    shr eax, 4 ; code_of_size
+    and ecx, 0fh ; shift
+    add ecx, 18
+    shl rax, cl ; rax: CacheSize
+    mov rcx, [rsp + 64] ; rcx: cores
+    xor edx, edx ; rdx:rax = CacheSize for the unsigned divide
+    div rcx
+    mov [rsp + 56], rax
+    jmp CacheSizeMacro12
+
+CacheSizeMacro11:
+    mov QWORD PTR [rsp + 56], -1
+
+CacheSizeMacro12:
+    mov rax, [rsp + 16]
+    mov rbx, [rsp + 24]
+    mov rcx, [rsp + 32]
+    mov rdx, [rsp + 40]
+    mov r8, [rsp + 48]
+    mov reg, [rsp + 56]
+    add rsp, 72
+;=========================================
+ENDM
+
+.LIST
+ |